Diffstat (limited to 'unsupported/Eigen/CXX11/src')
66 files changed, 3511 insertions, 1152 deletions
diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt deleted file mode 100644 index 1734262bb..000000000 --- a/unsupported/Eigen/CXX11/src/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -add_subdirectory(util) -add_subdirectory(ThreadPool) -add_subdirectory(Tensor) -add_subdirectory(TensorSymmetry) diff --git a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt deleted file mode 100644 index 6d4b3ea0d..000000000 --- a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_CXX11_Tensor_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_Tensor_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Tensor COMPONENT Devel - ) diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md index eeca2f69e..02146527b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/README.md +++ b/unsupported/Eigen/CXX11/src/Tensor/README.md @@ -1102,7 +1102,7 @@ Example: Reduction along two dimensions. As a special case, if you pass no parameter to a reduction operation the original tensor is reduced along *all* its dimensions. The result is a -one-dimension tensor with a single value. +scalar, represented as a zero-dimension tensor. Eigen::Tensor<float, 3> a(2, 3, 4); a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f}, @@ -1112,7 +1112,7 @@ one-dimension tensor with a single value. {19.0f, 18.0f, 17.0f, 16.0f}, {20.0f, 21.0f, 22.0f, 23.0f}}}); // Reduce along all dimensions using the sum() operator. - Eigen::Tensor<float, 1> b = a.sum(); + Eigen::Tensor<float, 0> b = a.sum(); cout << "b" << endl << b << endl << endl; => b @@ -1168,6 +1168,44 @@ Reduce a tensor using a user-defined reduction operator. See ```SumReducer``` in TensorFunctors.h for information on how to implement a reduction operator. +## Scan Operations + +A *Scan* operation returns a tensor with the same dimensions as the original +tensor. The operation performs an inclusive scan along the specified +axis, which means it computes a running total along the axis for a given +reduction operation. +If the reduction operation corresponds to summation, then this computes the +prefix sum of the tensor along the given axis. + +Example: +dd a comment to this line + + // Create a tensor of 2 dimensions + Eigen::Tensor<int, 2> a(2, 3); + a.setValues({{1, 2, 3}, {4, 5, 6}}); + // Scan it along the second dimension (1) using summation + Eigen::Tensor<int, 2> b = a.cumsum(1); + // The result is a tensor with the same size as the input + cout << "a" << endl << a << endl << endl; + cout << "b" << endl << b << endl << endl; + => + a + 1 2 3 + 6 5 4 + + b + 1 3 6 + 4 9 15 + +### <Operation> cumsum(const Index& axis) + +Perform a scan by summing consecutive entries. + +### <Operation> cumprod(const Index& axis) + +Perform a scan by multiplying consecutive entries. + + ## Convolutions ### <Operation> convolve(const Kernel& kernel, const Dimensions& dims) diff --git a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h index 759dede3f..1940a9692 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/Tensor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/Tensor.h @@ -110,7 +110,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp inline Self& base() { return *this; } inline const Self& base() const { return *this; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... 
IndexTypes> EIGEN_DEVICE_FUNC inline const Scalar& coeff(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { @@ -150,7 +150,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp return m_storage.data()[index]; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> inline Scalar& coeffRef(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { @@ -190,7 +190,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp return m_storage.data()[index]; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> inline const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { @@ -257,7 +257,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp return coeff(index); } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> inline Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) { @@ -336,7 +336,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp { } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index firstDimension, IndexTypes... otherDimensions) : m_storage(firstDimension, otherDimensions...) @@ -350,22 +350,22 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp { EIGEN_STATIC_ASSERT(1 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2) : m_storage(dim1*dim2, array<Index, 2>(dim1, dim2)) { EIGEN_STATIC_ASSERT(2 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3) : m_storage(dim1*dim2*dim3, array<Index, 3>(dim1, dim2, dim3)) { EIGEN_STATIC_ASSERT(3 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4) : m_storage(dim1*dim2*dim3*dim4, array<Index, 4>(dim1, dim2, dim3, dim4)) { EIGEN_STATIC_ASSERT(4 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Tensor(Index dim1, Index dim2, Index dim3, Index dim4, Index dim5) : m_storage(dim1*dim2*dim3*dim4*dim5, array<Index, 5>(dim1, dim2, dim3, dim4, dim5)) { EIGEN_STATIC_ASSERT(5 == NumIndices, YOU_MADE_A_PROGRAMMING_MISTAKE) @@ -418,7 +418,7 @@ class Tensor : public TensorBase<Tensor<Scalar_, NumIndices_, Options_, IndexTyp return *this; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC void resize(Index firstDimension, IndexTypes... 
otherDimensions) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h index babafe108..d06f40cd8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorArgMax.h @@ -254,6 +254,14 @@ struct TensorEvaluator<const TensorTupleReducerOp<ReduceOp, Dims, ArgType>, Devi EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double compute_cost = 1.0 + + (m_return_dim < 0 ? 0.0 : (TensorOpCost::ModCost<Index>() + TensorOpCost::DivCost<Index>())); + return m_orig_impl.costPerCoeff(vectorized) + + m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, compute_cost); + } + private: EIGEN_DEVICE_FUNC void gen_strides(const InputDimensions& dims, StrideDims& strides) { if (m_return_dim < 0) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h index 1a34f3ccc..7a45a5cf4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h @@ -192,6 +192,12 @@ class TensorBase<Derived, ReadOnlyAccessors> } EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived> + log1p() const { + return unaryExpr(internal::scalar_log1p_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived> abs() const { return unaryExpr(internal::scalar_abs_op<Scalar>()); @@ -204,34 +210,74 @@ class TensorBase<Derived, ReadOnlyAccessors> } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived> + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >, const Derived> pow(Scalar exponent) const { - return unaryExpr(internal::scalar_pow_op<Scalar>(exponent)); + return unaryExpr(internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >(exponent)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived> + real() const { + return unaryExpr(internal::scalar_real_op<Scalar>()); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived> + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived> + imag() const { + return unaryExpr(internal::scalar_imag_op<Scalar>()); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >, const Derived> operator+ (Scalar rhs) const { - return unaryExpr(internal::scalar_add_op<Scalar>(rhs)); + return unaryExpr(internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >(rhs)); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived> + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_sum_op<Scalar> >, const Derived> + operator+ (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op<internal::scalar_sum_op<Scalar> >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >, const Derived> operator- (Scalar rhs) const { EIGEN_STATIC_ASSERT((NumTraits<Scalar>::IsSigned || internal::is_same<Scalar, const 
std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - return unaryExpr(internal::scalar_sub_op<Scalar>(rhs)); + return unaryExpr(internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_difference_op<Scalar> >, const Derived> + operator- (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op<internal::scalar_difference_op<Scalar> >(lhs)); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived> + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >, const Derived> operator* (Scalar rhs) const { - return unaryExpr(internal::scalar_multiple_op<Scalar>(rhs)); + return unaryExpr(internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >(rhs)); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived> + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_product_op<Scalar> >, const Derived> + operator* (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op<internal::scalar_product_op<Scalar> >(lhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >, const Derived> operator/ (Scalar rhs) const { - return unaryExpr(internal::scalar_quotient1_op<Scalar>(rhs)); + return unaryExpr(internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >(rhs)); + } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE friend + const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_quotient_op<Scalar> >, const Derived> + operator/ (Scalar lhs, const Derived& rhs) { + return rhs.unaryExpr(internal::bind1st_op<internal::scalar_quotient_op<Scalar> >(lhs)); } EIGEN_DEVICE_FUNC @@ -277,7 +323,6 @@ class TensorBase<Derived, ReadOnlyAccessors> return unaryExpr(internal::scalar_floor_op<Scalar>()); } - // Generic binary operation support. template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived> @@ -342,66 +387,66 @@ class TensorBase<Derived, ReadOnlyAccessors> // Comparisons and tests. 
template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const OtherDerived> + const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const OtherDerived> operator<(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LT>()); + return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>()); } template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const OtherDerived> + const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const OtherDerived> operator<=(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LE>()); + return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>()); } template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const OtherDerived> + const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const OtherDerived> operator>(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GT>()); + return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>()); } template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const OtherDerived> + const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const OtherDerived> operator>=(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GE>()); + return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>()); } template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const OtherDerived> + const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const OtherDerived> operator==(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_EQ>()); + return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>()); } template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE - const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived> + const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived> operator!=(const OtherDerived& other) const { - return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>()); + return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>()); } // comparisons and tests for Scalars EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + EIGEN_STRONG_INLINE const 
TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > operator<(Scalar threshold) const { return operator<(constant(threshold)); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > operator<=(Scalar threshold) const { return operator<=(constant(threshold)); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > operator>(Scalar threshold) const { return operator>(constant(threshold)); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > operator>=(Scalar threshold) const { return operator>=(constant(threshold)); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > operator==(Scalar threshold) const { return operator==(constant(threshold)); } EIGEN_DEVICE_FUNC - EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > + EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> > operator!=(Scalar threshold) const { return operator!=(constant(threshold)); } @@ -453,6 +498,28 @@ class TensorBase<Derived, ReadOnlyAccessors> return TensorFFTOp<const FFT, const Derived, FFTDataType, FFTDirection>(derived(), fft); } + // Scan. 
+ typedef TensorScanOp<internal::SumReducer<CoeffReturnType>, const Derived> TensorScanSumOp; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanSumOp + cumsum(const Index& axis, bool exclusive = false) const { + return TensorScanSumOp(derived(), axis, exclusive); + } + + typedef TensorScanOp<internal::ProdReducer<CoeffReturnType>, const Derived> TensorScanProdOp; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanProdOp + cumprod(const Index& axis, bool exclusive = false) const { + return TensorScanProdOp(derived(), axis, exclusive); + } + + template <typename Reducer> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorScanOp<Reducer, const Derived> + scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const { + return TensorScanOp<Reducer, const Derived>(derived(), axis, exclusive, reducer); + } + // Reductions. template <typename Dims> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorReductionOp<internal::SumReducer<CoeffReturnType>, const Dims, const Derived> @@ -676,6 +743,12 @@ class TensorBase<Derived, ReadOnlyAccessors> slice(const StartIndices& startIndices, const Sizes& sizes) const { return TensorSlicingOp<const StartIndices, const Sizes, const Derived>(derived(), startIndices, sizes); } + template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived> + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { + return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, + const Derived>(derived(), startIndices, stopIndices, strides); + } template <Index DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorChippingOp<DimId, const Derived> chip(const Index offset) const { @@ -750,8 +823,8 @@ class TensorBase<Derived, ReadOnlyAccessors> EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); } }; -template<typename Derived> -class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> { +template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> +class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> { public: typedef internal::traits<Derived> DerivedTraits; typedef typename DerivedTraits::Scalar Scalar; @@ -761,7 +834,7 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor; template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize; - template <typename OtherDerived, int AccessLevel> friend class TensorBase; + template <typename OtherDerived, int OtherAccessLevel> friend class TensorBase; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& setZero() { @@ -780,7 +853,7 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA return derived() = this->template random<RandomGenerator>(); } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Derived& setValues( const typename internal::Initializer<Derived, NumDimensions>::InitList& vals) { @@ -851,6 +924,19 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyA return TensorSlicingOp<const StartIndices, const Sizes, Derived>(derived(), startIndices, sizes); } + template <typename 
StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, const Derived> + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) const { + return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, + const Derived>(derived(), startIndices, stopIndices, strides); + } + template <typename StartIndices, typename StopIndices, typename Strides> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, Derived> + stridedSlice(const StartIndices& startIndices, const StopIndices& stopIndices, const Strides& strides) { + return TensorStridingSlicingOp<const StartIndices, const StopIndices, const Strides, + Derived>(derived(), startIndices, stopIndices, strides); + } + template <DenseIndex DimId> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorChippingOp<DimId, const Derived> chip(const Index offset) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h index c771496e2..5d67f69f3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBroadcasting.h @@ -106,7 +106,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { - IsAligned = false, + IsAligned = true, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, RawAccess = false @@ -118,7 +118,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> // The broadcasting op doesn't change the rank of the tensor. One can't broadcast a scalar // and store the result in a scalar. Instead one should reshape the scalar into a a N-D // tensor with N >= 1 of 1 element first and then broadcast. 
- EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); const InputDimensions& input_dims = m_impl.dimensions(); const Broadcast& broadcast = op.broadcast(); for (int i = 0; i < NumDims; ++i) { @@ -247,7 +247,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index originalIndex = index; @@ -299,7 +299,7 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index originalIndex = index; @@ -354,11 +354,11 @@ struct TensorEvaluator<const TensorBroadcastingOp<Broadcast, ArgType>, Device> if (NumDims > 0) { for (int i = NumDims - 1; i > 0; --i) { compute_cost += TensorOpCost::DivCost<Index>(); - if (internal::index_statically_eq<Broadcast>()(i, 1)) { + if (internal::index_statically_eq<Broadcast>(i, 1)) { compute_cost += TensorOpCost::MulCost<Index>() + TensorOpCost::AddCost<Index>(); } else { - if (!internal::index_statically_eq<InputDimensions>()(i, 1)) { + if (!internal::index_statically_eq<InputDimensions>(i, 1)) { compute_cost += TensorOpCost::MulCost<Index>() + TensorOpCost::ModCost<Index>() + TensorOpCost::AddCost<Index>(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h index 2742dbb95..1ba7ef170 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorChipping.h @@ -152,8 +152,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_dim(op.dim()), m_device(device) { - // We could also support the case where NumInputDims==1 if needed. 
- EIGEN_STATIC_ASSERT(NumInputDims >= 2, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumInputDims >= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(NumInputDims > m_dim.actualDim()); const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); @@ -203,7 +202,7 @@ struct TensorEvaluator<const TensorChippingOp<DimId, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if ((static_cast<int>(Layout) == static_cast<int>(ColMajor) && m_dim.actualDim() == 0) || @@ -342,7 +341,7 @@ struct TensorEvaluator<TensorChippingOp<DimId, ArgType>, Device> template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) if ((static_cast<int>(this->Layout) == static_cast<int>(ColMajor) && this->m_dim.actualDim() == 0) || (static_cast<int>(this->Layout) == static_cast<int>(RowMajor) && this->m_dim.actualDim() == NumInputDims-1)) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h index 839c6e3e5..59bf90d93 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConcatenation.h @@ -128,8 +128,8 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy : m_leftImpl(op.lhsExpression(), device), m_rightImpl(op.rhsExpression(), device), m_axis(op.axis()) { EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<LeftArgType, Device>::Layout) == static_cast<int>(TensorEvaluator<RightArgType, Device>::Layout) || NumDims == 1), YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT(NumDims == RightNumDims, YOU_MADE_A_PROGRAMMING_MISTAKE); - EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims == RightNumDims), YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); eigen_assert(0 <= m_axis && m_axis < NumDims); const Dimensions& lhs_dims = m_leftImpl.dimensions(); @@ -248,8 +248,8 @@ struct TensorEvaluator<const TensorConcatenationOp<Axis, LeftArgType, RightArgTy template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index + packetSize - 1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; @@ -344,8 +344,8 @@ template<typename Axis, typename LeftArgType, typename RightArgType, typename De template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - static const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + const int packetSize = internal::unpacket_traits<PacketReturnType>::size; + EIGEN_STATIC_ASSERT((packetSize > 
1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index + packetSize - 1 < this->dimensions().TotalSize()); EIGEN_ALIGN_MAX CoeffReturnType values[packetSize]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h index 6f113b903..20b29e5fd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h @@ -25,8 +25,9 @@ template<typename Dimensions, typename LhsXprType, typename RhsXprType> struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> > { // Type promotion to handle the case where the types of the lhs and the rhs are different. - typedef typename internal::promote_storage_type<typename LhsXprType::Scalar, - typename RhsXprType::Scalar>::ret Scalar; + typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type, + typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar; + typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind, typename traits<RhsXprType>::StorageKind>::ret StorageKind; typedef typename promote_index_type<typename traits<LhsXprType>::Index, @@ -37,7 +38,7 @@ struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> > typedef typename remove_reference<RhsNested>::type _RhsNested; // From NumDims below. - static const int NumDimensions = max_n_1<traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value>::size; + static const int NumDimensions = traits<RhsXprType>::NumDimensions + traits<RhsXprType>::NumDimensions - 2 * array_size<Dimensions>::value; static const int Layout = traits<LhsXprType>::Layout; enum { @@ -65,7 +66,7 @@ struct traits<TensorEvaluator<const TensorContractionOp<Indices_, LeftArgType_, typedef Device_ Device; // From NumDims below. 
- static const int NumDimensions = max_n_1<traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value>::size; + static const int NumDimensions = traits<LeftArgType_>::NumDimensions + traits<RightArgType_>::NumDimensions - 2 * array_size<Indices_>::value; }; } // end namespace internal @@ -75,8 +76,8 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp { public: typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar; - typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType, - typename RhsXprType::CoeffReturnType>::ret CoeffReturnType; + typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType, + typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType; typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested; typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind; typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index; @@ -140,11 +141,11 @@ struct TensorContractionEvaluatorBase static const int RDims = internal::array_size<typename TensorEvaluator<EvalRightArgType, Device>::Dimensions>::value; static const int ContractDims = internal::array_size<Indices>::value; - static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size; + static const int NumDims = LDims + RDims - 2 * ContractDims; typedef array<Index, ContractDims> contract_t; - typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t; - typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; typedef DSizes<Index, NumDims> Dimensions; @@ -218,11 +219,9 @@ struct TensorContractionEvaluatorBase rhs_strides[i+1] = rhs_strides[i] * eval_right_dims[i]; } - m_i_strides[0] = 1; - m_j_strides[0] = 1; - if(ContractDims) { - m_k_strides[0] = 1; - } + if (m_i_strides.size() > 0) m_i_strides[0] = 1; + if (m_j_strides.size() > 0) m_j_strides[0] = 1; + if (m_k_strides.size() > 0) m_k_strides[0] = 1; m_i_size = 1; m_j_size = 1; @@ -318,11 +317,6 @@ struct TensorContractionEvaluatorBase } } - // Scalar case. We represent the result as a 1d tensor of size 1. 
- if (LDims + RDims == 2 * ContractDims) { - m_dimensions[0] = 1; - } - // If the layout is RowMajor, we need to reverse the m_dimensions if (static_cast<int>(Layout) == static_cast<int>(RowMajor)) { for (int i = 0, j = NumDims - 1; i < j; i++, j--) { @@ -510,7 +504,7 @@ struct TensorContractionEvaluatorBase // call gebp (matrix kernel) // The parameters here are copied from Eigen's GEMM implementation - gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, 1.0, -1, -1, 0, 0); + gebp(output.getSubMapper(i2, j2), blockA, blockB, actual_mc, actual_kc, actual_nc, Scalar(1), -1, -1, 0, 0); } } } @@ -607,15 +601,14 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT static const int ContractDims = internal::array_size<Indices>::value; typedef array<Index, ContractDims> contract_t; - typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t; - typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; - static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size; + static const int NumDims = LDims + RDims - 2 * ContractDims; // Could we use NumDimensions here? typedef DSizes<Index, NumDims> Dimensions; - EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) { } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h index 6a3ef14ef..d65dbb40f 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h @@ -461,8 +461,8 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs, #undef writeResultShmem #undef writeRow - const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); - const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); + const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8); + const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8); if (threadIdx.x < max_i_write) { if (max_j_write == 8) { @@ -1240,10 +1240,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT typedef array<Index, RDims> right_dim_mapper_t; typedef array<Index, ContractDims> contract_t; - typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t; - typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; - static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size; + static const int NumDims = LDims + RDims - 2 * ContractDims; typedef DSizes<Index, NumDims> Dimensions; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h index b27e1a1b4..9b2cb3ff6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h @@ -130,19 +130,19 @@ class SimpleTensorContractionMapper { } Index contract_val = left ? 
col : row; - for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) { - const Index idx = contract_val / m_k_strides[i]; - linidx += idx * m_contract_strides[i]; - contract_val -= idx * m_k_strides[i]; - } - if(array_size<contract_t>::value > 0) { - if (side == Rhs && inner_dim_contiguous) { - eigen_assert(m_contract_strides[0] == 1); - linidx += contract_val; - } else { - linidx += contract_val * m_contract_strides[0]; - } + for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) { + const Index idx = contract_val / m_k_strides[i]; + linidx += idx * m_contract_strides[i]; + contract_val -= idx * m_k_strides[i]; + } + + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx += contract_val; + } else { + linidx += contract_val * m_contract_strides[0]; + } } return linidx; @@ -153,15 +153,15 @@ class SimpleTensorContractionMapper { const bool left = (side == Lhs); Index nocontract_val[2] = {left ? row : col, left ? row + distance : col}; Index linidx[2] = {0, 0}; - for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) { - const Index idx0 = nocontract_val[0] / m_ij_strides[i]; - const Index idx1 = nocontract_val[1] / m_ij_strides[i]; - linidx[0] += idx0 * m_nocontract_strides[i]; - linidx[1] += idx1 * m_nocontract_strides[i]; - nocontract_val[0] -= idx0 * m_ij_strides[i]; - nocontract_val[1] -= idx1 * m_ij_strides[i]; - } if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) { + for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) { + const Index idx0 = nocontract_val[0] / m_ij_strides[i]; + const Index idx1 = nocontract_val[1] / m_ij_strides[i]; + linidx[0] += idx0 * m_nocontract_strides[i]; + linidx[1] += idx1 * m_nocontract_strides[i]; + nocontract_val[0] -= idx0 * m_ij_strides[i]; + nocontract_val[1] -= idx1 * m_ij_strides[i]; + } if (side == Lhs && inner_dim_contiguous) { eigen_assert(m_nocontract_strides[0] == 1); linidx[0] += nocontract_val[0]; @@ -173,22 +173,24 @@ class SimpleTensorContractionMapper { } Index contract_val[2] = {left ? col : row, left ? 
col : row + distance}; - for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) { - const Index idx0 = contract_val[0] / m_k_strides[i]; - const Index idx1 = contract_val[1] / m_k_strides[i]; - linidx[0] += idx0 * m_contract_strides[i]; - linidx[1] += idx1 * m_contract_strides[i]; - contract_val[0] -= idx0 * m_k_strides[i]; - contract_val[1] -= idx1 * m_k_strides[i]; - } + if (array_size<contract_t>::value> 0) { + for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) { + const Index idx0 = contract_val[0] / m_k_strides[i]; + const Index idx1 = contract_val[1] / m_k_strides[i]; + linidx[0] += idx0 * m_contract_strides[i]; + linidx[1] += idx1 * m_contract_strides[i]; + contract_val[0] -= idx0 * m_k_strides[i]; + contract_val[1] -= idx1 * m_k_strides[i]; + } - if (side == Rhs && inner_dim_contiguous) { - eigen_assert(m_contract_strides[0] == 1); - linidx[0] += contract_val[0]; - linidx[1] += contract_val[1]; - } else { - linidx[0] += contract_val[0] * m_contract_strides[0]; - linidx[1] += contract_val[1] * m_contract_strides[0]; + if (side == Rhs && inner_dim_contiguous) { + eigen_assert(m_contract_strides[0] == 1); + linidx[0] += contract_val[0]; + linidx[1] += contract_val[1]; + } else { + linidx[0] += contract_val[0] * m_contract_strides[0]; + linidx[1] += contract_val[1] * m_contract_strides[0]; + } } return IndexPair<Index>(linidx[0], linidx[1]); } @@ -200,7 +202,7 @@ class SimpleTensorContractionMapper { return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size; } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const { - return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1; + return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? 
m_contract_strides[0] : 1; } protected: diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h index 9044454fd..ee16cde9b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h @@ -14,6 +14,8 @@ #ifdef EIGEN_USE_THREADS namespace Eigen { + +#ifdef EIGEN_USE_SIMPLE_THREAD_POOL namespace internal { template<typename LhsScalar, typename LhsMapper, typename Index> @@ -52,7 +54,7 @@ struct packRhsAndKernelArg { }; } // end namespace internal - +#endif // EIGEN_USE_SIMPLE_THREAD_POOL template<typename Indices, typename LeftArgType, typename RightArgType> struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> : @@ -92,10 +94,10 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT typedef array<Index, RDims> right_dim_mapper_t; typedef array<Index, ContractDims> contract_t; - typedef array<Index, max_n_1<LDims - ContractDims>::size> left_nocontract_t; - typedef array<Index, max_n_1<RDims - ContractDims>::size> right_nocontract_t; + typedef array<Index, LDims - ContractDims> left_nocontract_t; + typedef array<Index, RDims - ContractDims> right_nocontract_t; - static const int NumDims = max_n_1<LDims + RDims - 2 * ContractDims>::size; + static const int NumDims = LDims + RDims - 2 * ContractDims; typedef DSizes<Index, NumDims> Dimensions; @@ -110,6 +112,623 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT TensorEvaluator(const XprType& op, const Device& device) : Base(op, device) {} +#ifndef EIGEN_USE_SIMPLE_THREAD_POOL + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, + bool rhs_inner_dim_reordered, int Alignment> + void evalProduct(Scalar* buffer) const { + typedef + typename internal::remove_const<typename EvalLeftArgType::Scalar>::type + LhsScalar; + typedef + typename internal::remove_const<typename EvalRightArgType::Scalar>::type + RhsScalar; + typedef typename internal::gebp_traits<LhsScalar, RhsScalar> Traits; + typedef TensorEvaluator<EvalLeftArgType, Device> LeftEvaluator; + typedef TensorEvaluator<EvalRightArgType, Device> RightEvaluator; + typedef internal::TensorContractionInputMapper< + LhsScalar, Index, internal::Lhs, LeftEvaluator, left_nocontract_t, + contract_t, internal::packet_traits<LhsScalar>::size, + lhs_inner_dim_contiguous, false, Unaligned> + LhsMapper; + typedef internal::TensorContractionInputMapper< + RhsScalar, Index, internal::Rhs, RightEvaluator, right_nocontract_t, + contract_t, internal::packet_traits<RhsScalar>::size, + rhs_inner_dim_contiguous, rhs_inner_dim_reordered, Unaligned> + RhsMapper; + typedef internal::blas_data_mapper<Scalar, Index, ColMajor> OutputMapper; + typedef internal::gemm_pack_lhs<LhsScalar, Index, + typename LhsMapper::SubMapper, Traits::mr, + Traits::LhsProgress, ColMajor> + LhsPacker; + typedef internal::gemm_pack_rhs< + RhsScalar, Index, typename RhsMapper::SubMapper, Traits::nr, ColMajor> + RhsPacker; + typedef internal::gebp_kernel<LhsScalar, RhsScalar, Index, OutputMapper, + Traits::mr, Traits::nr, false, false> + GebpKernel; + + const Index m = this->m_i_size; + const Index n = this->m_j_size; + const Index k = this->m_k_size; + if (m == 0 || n == 0 || k == 0) return; + + // Compute a set of algorithm parameters: + // - kernel block sizes (bm, bn, bk) + // - task grain sizes (number of kernels executed per task: gm, gn) 
+ // - number of threads + // - sharding by row/column + // - parallel packing or first lhs then rhs + // and some derived parameters: + // - number of tasks (nm, nn, nk) + // - number of kernels (nm0, nn0) + // Unfortunately, all these parameters are tightly interdependent. + // So in some cases we first compute approximate values, then compute other + // values based on these approximations and then refine the approximations. + + // There are lots of heuristics here. There is some reasoning behind them, + // but ultimately they are just tuned on contraction benchmarks for + // different input configurations, thread counts and instruction sets. + // So feel free to question any of them. + + // Compute whether we want to shard by row or by column. + // This is a first approximation, it will be refined later. Since we don't + // know number of threads yet we use 2, because what's we are most + // interested in at this point is whether it makes sense to use + // parallelization at all or not. + bool shard_by_col = shardByCol(m, n, 2); + + // First approximation of kernel blocking sizes. + // Again, we don't know number of threads yet, so we use 2. + Index bm, bn, bk; + if (shard_by_col) { + internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, + internal::ShardByCol> + blocking(k, m, n, 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, + internal::ShardByRow> + blocking(k, m, n, 2); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Compute optimal number of threads. + // Note: we use bk instead of k here because we are interested in amount of + // _parallelizable_ computations, and computations are not parallelizable + // across k dimension. + const TensorOpCost cost = + contractionCost(m, n, bm, bn, bk, shard_by_col, false); + int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( + static_cast<double>(n) * m, cost, this->m_device.numThreads()); + + // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost + // model is not tuned. Remove this when the cost model is tuned. + if (n == 1) num_threads = 1; + + if (num_threads == 1) { + // The single-threaded algorithm should be faster in this case. + if (n == 1) + this->template evalGemv<lhs_inner_dim_contiguous, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Alignment>(buffer); + else + this->template evalGemm<lhs_inner_dim_contiguous, + rhs_inner_dim_contiguous, + rhs_inner_dim_reordered, Alignment>(buffer); + return; + } + + // Now that we know number of threads, recalculate sharding and blocking. + shard_by_col = shardByCol(m, n, num_threads); + if (shard_by_col) { + internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, + internal::ShardByCol> + blocking(k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } else { + internal::TensorContractionBlocking<LhsMapper, RhsMapper, Index, + internal::ShardByRow> + blocking(k, m, n, num_threads); + bm = blocking.mc(); + bn = blocking.nc(); + bk = blocking.kc(); + } + + // Number of kernels for each dimension. + Index nm0 = divup(m, bm); + Index nn0 = divup(n, bn); + Index nk = divup(k, bk); + + // Calculate task grain size (number of kernels executed per task). + // This task size coarsening serves two purposes: + // 1. It reduces per-task overheads including synchronization overheads. + // 2. 
It allows to use caches better (reuse the same packed rhs in several + // consecutive kernels). + Index gm = 1; + Index gn = 1; + // If we are sharding by column, then we prefer to reduce rows first. + if (shard_by_col) { + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + } else { + gn = coarsenN(m, n, bm, bn, bk, gm, num_threads, shard_by_col); + gm = coarsenM(m, n, bm, bn, bk, gn, num_threads, shard_by_col); + } + // Number of tasks in each dimension. + Index nm = divup(nm0, gm); + Index nn = divup(nn0, gn); + + // Last by not least, decide whether we want to issue both lhs and rhs + // packing in parallel; or issue lhs packing first, and then issue rhs + // packing when lhs packing completes (for !shard_by_col lhs and rhs are + // swapped). Parallel packing allows more parallelism (for both packing and + // kernels), while sequential packing provides better locality (once + // a thread finishes rhs packing it proceed to kernels with that rhs). + // First, we are interested in parallel packing if there are few tasks. + bool parallel_pack = num_threads >= nm * nn; + // Also do parallel packing if all data fits into L2$. + if (m * bk * Index(sizeof(LhsScalar)) + n * bk * Index(sizeof(RhsScalar)) <= + l2CacheSize() * num_threads) + parallel_pack = true; + // But don't do it if we will use each rhs only once. Locality seems to be + // more important in this case. + if ((shard_by_col ? nm : nn) == 1) parallel_pack = false; + + LhsMapper lhs(this->m_leftImpl, this->m_left_nocontract_strides, + this->m_i_strides, this->m_left_contracting_strides, + this->m_k_strides); + + RhsMapper rhs(this->m_rightImpl, this->m_right_nocontract_strides, + this->m_j_strides, this->m_right_contracting_strides, + this->m_k_strides); + + Context<LhsPacker, RhsPacker, GebpKernel, LhsMapper, RhsMapper, + OutputMapper>(this->m_device, num_threads, lhs, rhs, buffer, m, n, + k, bm, bn, bk, nm, nn, nk, gm, gn, nm0, nn0, + shard_by_col, parallel_pack) + .run(); + } + + // Context coordinates a single parallel gemm operation. + template <typename LhsPacker, typename RhsPacker, typename GebpKernel, + typename LhsMapper, typename RhsMapper, typename OutputMapper> + class Context { + public: + Context(const Device& device, int num_threads, LhsMapper& lhs, + RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm, + Index bn, Index bk, Index nm, Index nn, Index nk, Index gm, + Index gn, Index nm0, Index nn0, bool shard_by_col, + bool parallel_pack) + : device_(device), + lhs_(lhs), + rhs_(rhs), + buffer_(buffer), + output_(buffer, tm), + num_threads_(num_threads), + shard_by_col_(shard_by_col), + parallel_pack_(parallel_pack), + m_(tm), + n_(tn), + k_(tk), + bm_(bm), + bn_(bn), + bk_(bk), + nm_(nm), + nn_(nn), + nk_(nk), + gm_(gm), + gn_(gn), + nm0_(nm0), + nn0_(nn0) + { + for (Index x = 0; x < P; x++) { + // Normal number of notifications for k slice switch is + // nm_ + nn_ + nm_ * nn_. However, first P - 1 slices will receive only + // nm_ + nn_ notifications, because they will not receive notifications + // from preceeding kernels. + state_switch_[x] = + x == 0 + ? 1 + : (parallel_pack_ ? nn_ + nm_ : (shard_by_col_ ? nn_ : nm_)) + + (x == P - 1 ? nm_ * nn_ : 0); + state_packing_ready_[x] = + parallel_pack_ ? 0 : (shard_by_col_ ? 
nm_ : nn_); + state_kernel_[x] = new std::atomic<uint8_t>*[nm_]; + for (Index m = 0; m < nm_; m++) { + state_kernel_[x][m] = new std::atomic<uint8_t>[nn_]; + // Kernels generally receive 3 notifications (previous kernel + 2 + // packing), but the first slice won't get notifications from previous + // kernels. + for (Index n = 0; n < nn_; n++) + state_kernel_[x][m][n].store( + (x == 0 ? 0 : 1) + (parallel_pack_ ? 2 : 1), + std::memory_order_relaxed); + } + } + + // Allocate memory for packed rhs/lhs matrices. + size_t align = numext::maxi(EIGEN_MAX_ALIGN_BYTES, 1); + size_t lhs_size = + divup<size_t>(bm_ * bk_ * sizeof(LhsScalar), align) * align; + size_t rhs_size = + divup<size_t>(bn_ * bk_ * sizeof(RhsScalar), align) * align; + packed_mem_ = static_cast<char*>(internal::aligned_malloc( + (nm0_ * lhs_size + nn0_ * rhs_size) * std::min<size_t>(nk_, P - 1))); + char* mem = static_cast<char*>(packed_mem_); + for (Index x = 0; x < numext::mini<Index>(nk_, P - 1); x++) { + packed_lhs_[x].resize(nm0_); + for (Index m = 0; m < nm0_; m++) { + packed_lhs_[x][m] = reinterpret_cast<LhsScalar*>(mem); + mem += lhs_size; + } + packed_rhs_[x].resize(nn0_); + for (Index n = 0; n < nn0_; n++) { + packed_rhs_[x][n] = reinterpret_cast<RhsScalar*>(mem); + mem += rhs_size; + } + } + } + + ~Context() { + for (Index x = 0; x < P; x++) { + for (Index m = 0; m < nm_; m++) delete[] state_kernel_[x][m]; + delete[] state_kernel_[x]; + } + internal::aligned_free(packed_mem_); + } + + void run() { + // Kick off packing of the first slice. + signal_switch(0, 1); + // Wait for overall completion. + // TODO(dvyukov): this wait can lead to deadlock. + // If nthreads contractions are concurrently submitted from worker + // threads, this wait will block all worker threads and the system will + // deadlock. + done_.Wait(); + } + + private: + Notification done_; + const Device& device_; + LhsMapper& lhs_; + RhsMapper& rhs_; + Scalar* const buffer_; + OutputMapper output_; + const int num_threads_; + const bool shard_by_col_; + const bool parallel_pack_; + // Matrix sizes. + const Index m_; + const Index n_; + const Index k_; + // Block sizes. + const Index bm_; + const Index bn_; + const Index bk_; + // Number of tasks. + const Index nm_; + const Index nn_; + const Index nk_; + // Task grain sizes (number of kernels executed per task). + const Index gm_; + const Index gn_; + // Number of blocks (this is different from ni_/nn_ because of task size + // coarsening). + const Index nm0_; + const Index nn0_; + + // Parallelization strategy. + // + // Blocks related to the same k block can run in parallel because they write + // to different output blocks. So we parallelize within k slices, this + // gives us parallelism level of m x n. Before we can start any kernels + // related to k-th slice, we need to issue m lhs packing tasks and n rhs + // packing tasks. + // + // However, there is a bottleneck when we are finishing kernels for k-th + // slice (at the very end there is only 1 runnable kernel). To mitigate this + // bottleneck we allow kernels from k-th and k+1-th slices to run in + // parallel. Note that (m, n, k) and (m, n, k+1) kernels write to the same + // output block, so they must not run in parallel. + // + // This gives us the following dependency graph. + // On each k slice we have m x n kernel tasks, m lhs paking tasks and n rhs + // packing tasks. 
+ // Kernel (m, n, k) can start when: + // - kernel (m, n, k-1) has finished + // - lhs packing (m, k) has finished + // - rhs packing (n, k) has finished + // Lhs/rhs packing can start when: + // - all k-1 packing has finished (artificially imposed to limit amount of + // parallel packing) + // + // On top of that we limit runnable tasks to two consecutive k slices. + // This is done to limit amount of memory we need for packed lhs/rhs + // (for each k slice we need m*bk + n*bk memory in packed_lhs_/packed_rhs_). + // + // state_switch_ tracks when we are ready to switch to the next k slice. + // state_kernel_[m][n] tracks when we are ready to kick off kernel (m, n). + // These variable are rolling over 3 consecutive k slices: first two we are + // actively executing + one to track completion of kernels in the second + // slice. + static const Index P = 3; + void* packed_mem_; + std::vector<LhsScalar*> packed_lhs_[P - 1]; + std::vector<RhsScalar*> packed_rhs_[P - 1]; + std::atomic<uint8_t>** state_kernel_[P]; + // state_switch_ is frequently modified by worker threads, while other + // fields are read-only after constructor. Let's move it to a separate cache + // line to reduce cache-coherency traffic. + char pad_[128]; + std::atomic<Index> state_packing_ready_[P]; + std::atomic<Index> state_switch_[P]; + + void pack_lhs(Index m, Index k) { + const Index mend = m * gm_ + gm(m); + for (Index m1 = m * gm_; m1 < mend; m1++) + LhsPacker()(packed_lhs_[k % (P - 1)][m1], + lhs_.getSubMapper(m1 * bm_, k * bk_), bk(k), bm(m1)); + + if (!parallel_pack_ && shard_by_col_) { + signal_packing(k); + } else { + signal_switch(k + 1); + for (Index n = nn_ - 1; n >= 0; n--) signal_kernel(m, n, k, n == 0); + } + } + + void pack_rhs(Index n, Index k) { + const Index nend = n * gn_ + gn(n); + for (Index n1 = n * gn_; n1 < nend; n1++) { + if (k == 0) { + // Zero the output memory in parallel. + // On 10000x2x10000 mm zeroing can easily take half of time. + // Zero (bn x m) row. Safe to do here because all kernels that will + // write to this memory depend on completion of this task. + // Note: don't call device_.memset() here. device_.memset() blocks on + // thread pool worker thread, which can lead to underutilization and + // deadlocks. + memset(buffer_ + n1 * bn_ * m_, 0, bn(n1) * m_ * sizeof(Scalar)); + } + RhsPacker()(packed_rhs_[k % (P - 1)][n1], + rhs_.getSubMapper(k * bk_, n1 * bn_), bk(k), bn(n1)); + } + + if (parallel_pack_ || shard_by_col_) { + signal_switch(k + 1); + for (Index m = nm_ - 1; m >= 0; m--) signal_kernel(m, n, k, m == 0); + } else { + signal_packing(k); + } + } + + void kernel(Index m, Index n, Index k) { + // Note: order of iteration matters here. Iteration over m is innermost + // because we want to reuse the same packed rhs in consequetive tasks + // (rhs fits into L2$ while lhs only into L3$). 
+ const Index nend = n * gn_ + gn(n); + const Index mend = m * gm_ + gm(m); + if (shard_by_col_) { + for (Index n1 = n * gn_; n1 < nend; n1++) { + for (Index m1 = m * gm_; m1 < mend; m1++) + GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), + packed_lhs_[k % (P - 1)][m1], + packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), + Scalar(1), -1, -1, 0, 0); + } + } else { + for (Index m1 = m * gm_; m1 < mend; m1++) + for (Index n1 = n * gn_; n1 < nend; n1++) { + GebpKernel()(output_.getSubMapper(m1 * bm_, n1 * bn_), + packed_lhs_[k % (P - 1)][m1], + packed_rhs_[k % (P - 1)][n1], bm(m1), bk(k), bn(n1), + Scalar(1), -1, -1, 0, 0); + } + } + signal_kernel(m, n, k + 1, false); + signal_switch(k + 2); + } + + void signal_packing(Index k) { + eigen_assert(!parallel_pack_); + Index s = state_packing_ready_[k % P].fetch_sub(1); + eigen_assert(s > 0); + if (s != 1) return; + state_packing_ready_[k % P] = shard_by_col_ ? nm_ : nn_; + enqueue_packing(k, shard_by_col_); + } + + void signal_kernel(Index m, Index n, Index k, bool sync) { + std::atomic<uint8_t>* state = &state_kernel_[k % P][m][n]; + Index s = state->load(); + eigen_assert(s > 0); + if (s != 1 && state->fetch_sub(1) != 1) return; + state->store(parallel_pack_ ? 3 : 2, std::memory_order_relaxed); + if (sync) + kernel(m, n, k); + else + device_.enqueueNoNotification([=]() { kernel(m, n, k); }); + } + + void signal_switch(Index k, Index v = 1) { + Index s = state_switch_[k % P].fetch_sub(v); + eigen_assert(s >= v); + if (s != v) return; + + // Ready to switch to the next k slice. + // Reset counter for the next iteration. + state_switch_[k % P] = + (parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)) + + nm_ * nn_; + if (k < nk_) { + // Issue lhs/rhs packing. Their completion will in turn kick off + // kernels. + if (parallel_pack_) { + enqueue_packing(k, !shard_by_col_); + enqueue_packing(k, shard_by_col_); + } else if (shard_by_col_) { + enqueue_packing(k, false); + } else { + enqueue_packing(k, true); + } + + // Termination handling. + // Because kernel completion signals k + 2 switch, we need to finish nk + // + 2 slices without issuing any tasks on nk + 1 slice. So here we + // pretend that all nk + 1 packing tasks just finish instantly; so that + // nk + 2 switch only waits for completion of nk kernels. + } else if (k == nk_) { + signal_switch(k + 1, + parallel_pack_ ? nm_ + nn_ : (shard_by_col_ ? nn_ : nm_)); + } else { + done_.Notify(); + } + } + + // Enqueue all rhs/lhs packing for k-th slice. + void enqueue_packing(Index k, bool rhs) { + enqueue_packing_helper(0, rhs ? nn_ : nm_, k, rhs); + } + + void enqueue_packing_helper(Index start, Index end, Index k, bool rhs) { + if (end - start == 1) { + if (rhs) + pack_rhs(start, k); + else + pack_lhs(start, k); + } else { + Index mid = (start + end) / 2; + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(mid, end, k, rhs); }); + device_.enqueueNoNotification( + [=]() { enqueue_packing_helper(start, mid, k, rhs); }); + } + } + + // Block sizes with accounting for potentially incomplete last block. + Index bm(Index m) const { return m + 1 < nm0_ ? bm_ : m_ + bm_ - bm_ * nm0_; } + Index bn(Index n) const { return n + 1 < nn0_ ? bn_ : n_ + bn_ - bn_ * nn0_; } + Index bk(Index k) const { return k + 1 < nk_ ? bk_ : k_ + bk_ - bk_ * nk_; } + // Task grain sizes accounting for potentially incomplete last task. + Index gm(Index m) const { return m + 1 < nm_ ? gm_ : nm0_ + gm_ - gm_ * nm_; } + Index gn(Index n) const { return n + 1 < nn_ ? 
gn_ : nn0_ + gn_ - gn_ * nn_; } + + Context(const Context&) = delete; + void operator=(const Context&) = delete; + }; + + // Decide whether we want to shard m x n contraction by columns or by rows. + static bool shardByCol(Index m, Index n, Index num_threads) { + // Note: we are comparing both n and m against Traits::nr, it is not + // a mistake. We are trying to figure out how both n and m will fit into + // the main sharding dimension. + + // Sharding by column is the default + // ... unless there is enough data for vectorization over rows + if (m / num_threads >= Traits::nr && + // and not enough data for vectorization over columns + (n / num_threads < Traits::nr || + // ... or barely enough data for vectorization over columns, + // but it is not evenly dividable across threads + (n / num_threads < 4 * Traits::nr && + (n % (num_threads * Traits::nr)) != 0 && + // ... and it is evenly dividable across threads for rows + ((m % (num_threads * Traits::nr)) == 0 || + // .. or it is not evenly dividable for both dimensions but + // there is much more data over rows so that corner effects are + // mitigated. + (m / n >= 6))))) + return false; + // Wait, or if matrices are just substantially prolonged over the other + // dimension. + if (n / num_threads < 16 * Traits::nr && m > n * 32) return false; + return true; + } + + Index coarsenM(Index m, Index n, Index bm, Index bn, Index bk, Index gn, + int num_threads, bool shard_by_col) const { + Index gm = 1; + Index gm1 = 1; + Index nm0 = divup(m, bm); + Index nm1 = nm0; + for (;;) { + // Find the next candidate for m grain size. It needs to result in + // different number of blocks. E.g. if we have 10 kernels, we want to try + // 5 and 10, but not 6, 7, 8 and 9. + while (gm1 <= nm0 && nm1 == divup(nm0, gm1)) gm1++; + if (gm1 > nm0) break; + // Check the candidate. + int res = checkGrain(m, n, bm, bn, bk, gm1, gn, gm, gn, num_threads, + shard_by_col); + if (res < 0) break; + nm1 = divup(nm0, gm1); + if (res == 0) continue; + // Commit new grain size. + gm = gm1; + } + return gm; + } + + Index coarsenN(Index m, Index n, Index bm, Index bn, Index bk, Index gm, + int num_threads, bool shard_by_col) const { + Index gn = 1; + Index gn1 = 1; + Index nn0 = divup(n, bn); + Index nn1 = nn0; + for (;;) { + while (gn1 <= nn0 && nn1 == divup(nn0, gn1)) gn1++; + if (gn1 > nn0) break; + int res = checkGrain(m, n, bm, bn, bk, gm, gn1, gm, gn, num_threads, + shard_by_col); + if (res < 0) break; + nn1 = divup(nn0, gn1); + if (res == 0) continue; + gn = gn1; + } + return gn; + } + + // checkGrain checks whether grain (gm, gn) is suitable and is better than + // (oldgm, oldgn). + int checkGrain(Index m, Index n, Index bm, Index bn, Index bk, Index gm, + Index gn, Index oldgm, Index oldgn, int num_threads, + bool shard_by_col) const { + const TensorOpCost cost = + contractionCost(bm * gm, bn * gn, bm, bn, bk, shard_by_col, true); + double taskSize = TensorCostModel<ThreadPoolDevice>::taskSize( + static_cast<double>(bm) * gm * bn * gn, cost); + // If the task is too small, then we agree on it regardless of anything + // else. Otherwise synchronization overheads will dominate. + if (taskSize < 1) return 1; + // If it is too large, then we reject it and all larger tasks. + if (taskSize > 2) return -1; + // Now we are in presumably good task size range. + // The main deciding factor here is parallelism. Consider that we have 12 + // kernels and 4 threads. Grains of 2, 3 and 4 all yield good task sizes. 
+ // But 2/4 yield 6/3 tasks, which gives us parallelism of 0.75 (at most 3/4 + // of cores will be busy). While grain size 3 gives us 4 tasks, which gives + // us parallelism of 1 (we can load all cores). + Index nm0 = divup(m, bm); + Index nn0 = divup(n, bn); + Index new_tasks = divup(nm0, gm) * divup(nn0, gn); + double new_parallelism = static_cast<double>(new_tasks) / + (divup<int>(new_tasks, num_threads) * num_threads); + Index old_tasks = divup(nm0, oldgm) * divup(nn0, oldgn); + double old_parallelism = static_cast<double>(old_tasks) / + (divup<int>(old_tasks, num_threads) * num_threads); + if (new_parallelism > old_parallelism || new_parallelism == 1) return 1; + return 0; + } + +#else // EIGEN_USE_SIMPLE_THREAD_POOL + template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment> void evalProduct(Scalar* buffer) const { if (this->m_j_size == 1) { @@ -376,7 +995,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT const Index actual_mc = numext::mini(m_base_start + arg.mc, arg.max_m) - m_base_start; gebp(arg.output.getSubMapper(m_base_start, arg.n), (*arg.blockAs)[blockAId], arg.blockB, - actual_mc, arg.kc, arg.nc, 1.0, -1, -1, 0, 0); + actual_mc, arg.kc, arg.nc, Scalar(1), -1, -1, 0, 0); // Notify that the kernel is done. const Index set_idx = blockAId * arg.n_blocks + arg.n_block_idx; @@ -384,6 +1003,47 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT } } } +#endif // EIGEN_USE_SIMPLE_THREAD_POOL + + TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk, + bool shard_by_col, bool prepacked) const { + const int packed_size = std::min<int>(PacketType<LhsScalar, Device>::size, + PacketType<RhsScalar, Device>::size); + const int output_packet_size = internal::unpacket_traits<PacketReturnType>::size; + const double kd = static_cast<double>(bk); + // Peak VFMA bandwidth is 0.5. However if we have not enough data for + // vectorization bandwidth drops. The 4.0 and 2.0 bandwidth is determined + // experimentally. + double computeBandwidth = bk == 1 ? 4.0 : + (shard_by_col ? bn : bm) < Traits::nr || + (shard_by_col ? bm : bn) < Traits::mr ? 2.0 : 0.5; +#ifndef EIGEN_VECTORIZE_FMA + // Bandwidth of all of VFMA/MULPS/ADDPS is 0.5 on latest Intel processors. + // However for MULPS/ADDPS we have dependent sequence of 2 such instructions, + // so overall bandwidth is 1.0. + if (computeBandwidth == 0.5) computeBandwidth = 1.0; +#endif + // Computations. + TensorOpCost cost = TensorOpCost(0, 0, kd * computeBandwidth, true, packed_size); + // Output stores. + cost += TensorOpCost(0, sizeof(CoeffReturnType), 0, true, output_packet_size); + if (prepacked) { + // Packing and kernels are executed in different tasks. When we calculate + // task grain size we look only at kernel cost assuming that kernel + // is more expensive than packing. + return cost; + } + // Lhs/rhs loads + computations. + TensorOpCost lhsCost = this->m_leftImpl.costPerCoeff(true) * (kd / n); + TensorOpCost rhsCost = this->m_rightImpl.costPerCoeff(true) * (kd / m); + // Lhs packing memory cost does not contribute considerably to overall + // execution time because lhs is prefetched early and accessed sequentially. 
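To make the grain-size comparison above concrete, here is the arithmetic behind the 12-kernel / 4-thread example; this is only an illustrative sketch of the same formula used by checkGrain, where divup(a, b) rounds the integer division up:

    // Hypothetical numbers from the comment above: 12 kernels, 4 threads.
    // Grain size 2 -> 6 tasks; they need divup(6, 4) = 2 "waves" of 4 threads,
    // i.e. 8 thread-slots, of which only 6 do work: parallelism = 6 / 8 = 0.75.
    // Grain size 3 -> 4 tasks; divup(4, 4) = 1 wave, 4 slots, all busy:
    // parallelism = 4 / 4 = 1.0, so grain size 3 is preferred.
    double parallelism(Index tasks, int num_threads) {
      return static_cast<double>(tasks) /
             (divup<int>(tasks, num_threads) * num_threads);
    }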
+ if (shard_by_col) + lhsCost.dropMemoryCost(); + else + rhsCost.dropMemoryCost(); + return cost + lhsCost + rhsCost; + } }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h index a2f1f71f5..860a6949a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConversion.h @@ -164,14 +164,14 @@ class TensorConversionOp : public TensorBase<TensorConversionOp<TargetType, XprT }; template <bool SameType, typename Eval, typename Scalar> struct ConversionSubExprEval { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar*) { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar*) { impl.evalSubExprsIfNeeded(NULL); return true; } }; template <typename Eval, typename Scalar> struct ConversionSubExprEval<true, Eval, Scalar> { - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static bool run(Eval& impl, Scalar* data) { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool run(Eval& impl, Scalar* data) { return impl.evalSubExprsIfNeeded(data); } }; @@ -193,7 +193,7 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> enum { IsAligned = false, - PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess && internal::type_casting_traits<SrcType, TargetType>::VectorizedCast, + PacketAccess = true, Layout = TensorEvaluator<ArgType, Device>::Layout, RawAccess = false }; @@ -224,11 +224,9 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio; - const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio; - PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, PacketReturnType, - SrcCoeffRatio, TgtCoeffRatio> converter(m_impl); - return converter.template packet<LoadMode>(index); + const bool Vectorizable = TensorEvaluator<ArgType, Device>::PacketAccess & + internal::type_casting_traits<SrcType, TargetType>::VectorizedCast; + return PacketConv<LoadMode, Vectorizable>::run(m_impl, index); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost @@ -249,7 +247,31 @@ struct TensorEvaluator<const TensorConversionOp<TargetType, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } protected: - TensorEvaluator<ArgType, Device> m_impl; + template <int LoadMode, bool ActuallyVectorize> + struct PacketConv { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) { + internal::scalar_cast_op<SrcType, TargetType> converter; + EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; + for (int i = 0; i < PacketSize; ++i) { + values[i] = converter(impl.coeff(index+i)); + } + PacketReturnType rslt = internal::pload<PacketReturnType>(values); + return rslt; + } + }; + + template <int LoadMode> + struct PacketConv<LoadMode, true> { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType run(const TensorEvaluator<ArgType, Device>& impl, Index index) { + const int SrcCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::SrcCoeffRatio; + const int TgtCoeffRatio = internal::type_casting_traits<SrcType, TargetType>::TgtCoeffRatio; + PacketConverter<TensorEvaluator<ArgType, Device>, PacketSourceType, 
PacketReturnType, + SrcCoeffRatio, TgtCoeffRatio> converter(impl); + return converter.template packet<LoadMode>(index); + } + }; + + TensorEvaluator<ArgType, Device> m_impl; }; } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 091007ab7..abdf742c6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -254,7 +254,7 @@ struct nested<TensorConvolutionOp<Dimensions, InputXprType, KernelXprType>, 1, t template<typename Indices, typename InputXprType, typename KernelXprType> -class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType> > +class TensorConvolutionOp : public TensorBase<TensorConvolutionOp<Indices, InputXprType, KernelXprType>, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits<TensorConvolutionOp>::Scalar Scalar; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h index 0f6dcedaa..83c449cf1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h @@ -10,10 +10,6 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H #define EIGEN_CXX11_TENSOR_TENSOR_COST_MODEL_H -//#if !defined(EIGEN_USE_GPU) -//#define EIGEN_USE_COST_MODEL -//#endif - namespace Eigen { /** \class TensorEvaluator @@ -32,45 +28,47 @@ class TensorOpCost { // model based on minimal reciprocal throughput numbers from Intel or // Agner Fog's tables would be better than what is there now. template <typename ArgType> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int MulCost() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int MulCost() { return internal::functor_traits< internal::scalar_product_op<ArgType, ArgType> >::Cost; } template <typename ArgType> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int AddCost() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int AddCost() { return internal::functor_traits<internal::scalar_sum_op<ArgType> >::Cost; } template <typename ArgType> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int DivCost() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int DivCost() { return internal::functor_traits< internal::scalar_quotient_op<ArgType, ArgType> >::Cost; } template <typename ArgType> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int ModCost() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int ModCost() { return internal::functor_traits<internal::scalar_mod_op<ArgType> >::Cost; } template <typename SrcType, typename TargetType> - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static int CastCost() { + static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int CastCost() { return internal::functor_traits< internal::scalar_cast_op<SrcType, TargetType> >::Cost; } + EIGEN_DEVICE_FUNC TensorOpCost() : bytes_loaded_(0), bytes_stored_(0), compute_cycles_(0) {} + EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles) : bytes_loaded_(bytes_loaded), bytes_stored_(bytes_stored), compute_cycles_(compute_cycles) {} + EIGEN_DEVICE_FUNC TensorOpCost(double bytes_loaded, double bytes_stored, double compute_cycles, bool vectorized, double packet_size) : bytes_loaded_(bytes_loaded), bytes_stored_(bytes_stored), compute_cycles_(vectorized ? 
compute_cycles / packet_size : compute_cycles) { - using std::isfinite; - eigen_assert(bytes_loaded >= 0 && (isfinite)(bytes_loaded)); - eigen_assert(bytes_stored >= 0 && (isfinite)(bytes_stored)); - eigen_assert(compute_cycles >= 0 && (isfinite)(compute_cycles)); + eigen_assert(bytes_loaded >= 0 && (numext::isfinite)(bytes_loaded)); + eigen_assert(bytes_stored >= 0 && (numext::isfinite)(bytes_stored)); + eigen_assert(compute_cycles >= 0 && (numext::isfinite)(compute_cycles)); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE double bytes_loaded() const { @@ -96,21 +94,21 @@ class TensorOpCost { } // TODO(rmlarsen): Define min in terms of total cost, not elementwise. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin( - const TensorOpCost& rhs) { - bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded()); - bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored()); - compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles()); - return *this; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin( + const TensorOpCost& rhs) const { + double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded()); + double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored()); + double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles()); + return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); } // TODO(rmlarsen): Define max in terms of total cost, not elementwise. - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax( - const TensorOpCost& rhs) { - bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded()); - bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored()); - compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles()); - return *this; + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax( + const TensorOpCost& rhs) const { + double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded()); + double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored()); + double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles()); + return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=( diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h index 1d2d162dc..4f5767bc7 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h @@ -12,6 +12,8 @@ namespace Eigen { +static const int kCudaScratchSize = 1024; + // This defines an interface that GPUDevice can take to use // CUDA streams underneath. class StreamInterface { @@ -24,6 +26,15 @@ class StreamInterface { // Allocate memory on the actual device where the computation will run virtual void* allocate(size_t num_bytes) const = 0; virtual void deallocate(void* buffer) const = 0; + + // Return a scratchpad buffer of size 1k + virtual void* scratchpad() const = 0; + + // Return a semaphore. The semaphore is initially initialized to 0, and + // each kernel using it is responsible for resetting to 0 upon completion + // to maintain the invariant that the semaphore is always equal to 0 upon + // each kernel start. 
+ virtual unsigned int* semaphore() const = 0; }; static cudaDeviceProp* m_deviceProperties; @@ -31,7 +42,21 @@ static bool m_devicePropInitialized = false; static void initializeDeviceProp() { if (!m_devicePropInitialized) { - if (!m_devicePropInitialized) { + // Attempts to ensure proper behavior in the case of multiple threads + // calling this function simultaneously. This would be trivial to + // implement if we could use std::mutex, but unfortunately mutex don't + // compile with nvcc, so we resort to atomics and thread fences instead. + // Note that if the caller uses a compiler that doesn't support c++11 we + // can't ensure that the initialization is thread safe. +#if __cplusplus >= 201103L + static std::atomic<bool> first(true); + if (first.exchange(false)) { +#else + static bool first = true; + if (first) { + first = false; +#endif + // We're the first thread to reach this point. int num_devices; cudaError_t status = cudaGetDeviceCount(&num_devices); if (status != cudaSuccess) { @@ -52,7 +77,19 @@ static void initializeDeviceProp() { assert(status == cudaSuccess); } } + +#if __cplusplus >= 201103L + std::atomic_thread_fence(std::memory_order_release); +#endif m_devicePropInitialized = true; + } else { + // Wait for the other thread to inititialize the properties. + while (!m_devicePropInitialized) { +#if __cplusplus >= 201103L + std::atomic_thread_fence(std::memory_order_acquire); +#endif + sleep(1); + } } } } @@ -62,12 +99,12 @@ static const cudaStream_t default_stream = cudaStreamDefault; class CudaStreamDevice : public StreamInterface { public: // Use the default stream on the current device - CudaStreamDevice() : stream_(&default_stream) { + CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) { cudaGetDevice(&device_); initializeDeviceProp(); } // Use the default stream on the specified device - CudaStreamDevice(int device) : stream_(&default_stream), device_(device) { + CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) { initializeDeviceProp(); } // Use the specified stream. Note that it's the @@ -75,7 +112,7 @@ class CudaStreamDevice : public StreamInterface { // the specified device. If no device is specified the code // assumes that the stream is associated to the current gpu device. 
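For orientation, a minimal sketch of how these pieces are typically wired together; the GpuDevice constructor taking a StreamInterface* is assumed here rather than shown in this hunk:

    cudaStream_t stream;
    cudaStreamCreate(&stream);
    // Wrap the raw stream; the caller keeps ownership of the stream itself.
    Eigen::CudaStreamDevice stream_device(&stream, /*device=*/0);
    Eigen::GpuDevice gpu_device(&stream_device);
    // scratchpad() and semaphore() are allocated lazily on first use; the
    // semaphore starts at 0 per the contract described above, and the scratch
    // buffer is released in ~CudaStreamDevice().
    unsigned int* sem = gpu_device.semaphore();
    void* scratch = gpu_device.scratchpad();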
CudaStreamDevice(const cudaStream_t* stream, int device = -1) - : stream_(stream), device_(device) { + : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) { if (device < 0) { cudaGetDevice(&device_); } else { @@ -89,6 +126,12 @@ class CudaStreamDevice : public StreamInterface { initializeDeviceProp(); } + virtual ~CudaStreamDevice() { + if (scratch_) { + deallocate(scratch_); + } + } + const cudaStream_t& stream() const { return *stream_; } const cudaDeviceProp& deviceProperties() const { return m_deviceProperties[device_]; @@ -112,9 +155,29 @@ class CudaStreamDevice : public StreamInterface { assert(err == cudaSuccess); } + virtual void* scratchpad() const { + if (scratch_ == NULL) { + scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int)); + } + return scratch_; + } + + virtual unsigned int* semaphore() const { + if (semaphore_ == NULL) { + char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize; + semaphore_ = reinterpret_cast<unsigned int*>(scratch); + cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_); + EIGEN_UNUSED_VARIABLE(err) + assert(err == cudaSuccess); + } + return semaphore_; + } + private: const cudaStream_t* stream_; int device_; + mutable void* scratch_; + mutable unsigned int* semaphore_; }; struct GpuDevice { @@ -131,22 +194,20 @@ struct GpuDevice { return stream_->stream(); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return stream_->allocate(num_bytes); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); - return NULL; -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void deallocate(void* buffer) const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE void deallocate(void* buffer) const { stream_->deallocate(buffer); + } -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif + EIGEN_STRONG_INLINE void* scratchpad() const { + return stream_->scratchpad(); + } + + EIGEN_STRONG_INLINE unsigned int* semaphore() const { + return stream_->semaphore(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const { @@ -156,30 +217,22 @@ struct GpuDevice { EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); #else - eigen_assert(false && "The default device should be used instead to generate kernel code"); + eigen_assert(false && "The default device should be used instead to generate kernel code"); #endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE void memcpyHostToDevice(void* dst, const void* src, size_t n) const { cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyHostToDevice, stream_->stream()); EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE void memcpyDeviceToHost(void* dst, const void* src, size_t n) const { cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToHost, stream_->stream()); EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); -#endif } 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memset(void* buffer, int c, size_t n) const { @@ -188,21 +241,21 @@ struct GpuDevice { EIGEN_UNUSED_VARIABLE(err) assert(err == cudaSuccess); #else - eigen_assert(false && "The default device should be used instead to generate kernel code"); + eigen_assert(false && "The default device should be used instead to generate kernel code"); #endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t numThreads() const { + EIGEN_STRONG_INLINE size_t numThreads() const { // FIXME return 32; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { + EIGEN_STRONG_INLINE size_t firstLevelCacheSize() const { // FIXME return 48*1024; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { + EIGEN_STRONG_INLINE size_t lastLevelCacheSize() const { // We won't try to take advantage of the l2 cache for the time being, and // there is no l3 cache on cuda devices. return firstLevelCacheSize(); @@ -222,56 +275,26 @@ struct GpuDevice { #endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE int getNumCudaMultiProcessors() const { return stream_->deviceProperties().multiProcessorCount; -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); - return 0; -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE int maxCudaThreadsPerBlock() const { return stream_->deviceProperties().maxThreadsPerBlock; -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); - return 0; -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE int maxCudaThreadsPerMultiProcessor() const { return stream_->deviceProperties().maxThreadsPerMultiProcessor; -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); - return 0; -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int sharedMemPerBlock() const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE int sharedMemPerBlock() const { return stream_->deviceProperties().sharedMemPerBlock; -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); - return 0; -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int majorDeviceVersion() const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE int majorDeviceVersion() const { return stream_->deviceProperties().major; -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); - return 0; -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int minorDeviceVersion() const { -#ifndef __CUDA_ARCH__ + EIGEN_STRONG_INLINE int minorDeviceVersion() const { return stream_->deviceProperties().minor; -#else - eigen_assert(false && "The default device should be used instead to generate kernel code"); - return 0; -#endif } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int maxBlocks() const { + EIGEN_STRONG_INLINE int maxBlocks() const { return max_blocks_; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h index c02891465..069680a11 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h @@ -14,7 +14,7 @@ namespace Eigen { // Use the SimpleThreadPool by default. 
We'll switch to the new non blocking // thread pool later. -#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL +#ifndef EIGEN_USE_SIMPLE_THREAD_POOL template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>; typedef NonBlockingThreadPool ThreadPool; #else @@ -106,7 +106,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) { // Build a thread pool device on top the an existing pool of threads. struct ThreadPoolDevice { // The ownership of the thread pool remains with the caller. - ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { } + ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { } EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const { return internal::aligned_malloc(num_bytes); @@ -130,7 +130,7 @@ struct ThreadPoolDevice { ::memset(buffer, c, n); } - EIGEN_STRONG_INLINE size_t numThreads() const { + EIGEN_STRONG_INLINE int numThreads() const { return num_threads_; } @@ -151,9 +151,7 @@ struct ThreadPoolDevice { template <class Function, class... Args> EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const { Notification* n = new Notification(); - std::function<void()> func = - std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...); - pool_->Schedule(func); + pool_->Schedule(std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...)); return n; } @@ -161,20 +159,118 @@ struct ThreadPoolDevice { EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f, Args&&... args) const { - std::function<void()> func = std::bind( - &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...); - pool_->Schedule(func); + pool_->Schedule(std::bind( + &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...)); } template <class Function, class... Args> EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const { - std::function<void()> func = std::bind(f, args...); - pool_->Schedule(func); + pool_->Schedule(std::bind(f, args...)); + } + + // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if + // called from one of the threads in pool_. Returns -1 otherwise. + EIGEN_STRONG_INLINE int currentThreadId() const { + return pool_->CurrentThreadId(); + } + + // parallelFor executes f with [0, n) arguments in parallel and waits for + // completion. F accepts a half-open interval [first, last). + // Block size is choosen based on the iteration cost and resulting parallel + // efficiency. If block_align is not nullptr, it is called to round up the + // block size. + void parallelFor(Index n, const TensorOpCost& cost, + std::function<Index(Index)> block_align, + std::function<void(Index, Index)> f) const { + typedef TensorCostModel<ThreadPoolDevice> CostModel; + if (n <= 1 || numThreads() == 1 || + CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) { + f(0, n); + return; + } + + // Calculate block size based on (1) the iteration cost and (2) parallel + // efficiency. We want blocks to be not too small to mitigate + // parallelization overheads; not too large to mitigate tail + // effect and potential load imbalance and we also want number + // of blocks to be evenly dividable across threads. 
+ + double block_size_f = 1.0 / CostModel::taskSize(1, cost); + Index block_size = numext::mini(n, numext::maxi<Index>(1, block_size_f)); + const Index max_block_size = + numext::mini(n, numext::maxi<Index>(1, 2 * block_size_f)); + if (block_align) { + Index new_block_size = block_align(block_size); + eigen_assert(new_block_size >= block_size); + block_size = numext::mini(n, new_block_size); + } + Index block_count = divup(n, block_size); + // Calculate parallel efficiency as fraction of total CPU time used for + // computations: + double max_efficiency = + static_cast<double>(block_count) / + (divup<int>(block_count, numThreads()) * numThreads()); + // Now try to increase block size up to max_block_size as long as it + // doesn't decrease parallel efficiency. + for (Index prev_block_count = block_count; prev_block_count > 1;) { + // This is the next block size that divides size into a smaller number + // of blocks than the current block_size. + Index coarser_block_size = divup(n, prev_block_count - 1); + if (block_align) { + Index new_block_size = block_align(coarser_block_size); + eigen_assert(new_block_size >= coarser_block_size); + coarser_block_size = numext::mini(n, new_block_size); + } + if (coarser_block_size > max_block_size) { + break; // Reached max block size. Stop. + } + // Recalculate parallel efficiency. + const Index coarser_block_count = divup(n, coarser_block_size); + eigen_assert(coarser_block_count < prev_block_count); + prev_block_count = coarser_block_count; + const double coarser_efficiency = + static_cast<double>(coarser_block_count) / + (divup<int>(coarser_block_count, numThreads()) * numThreads()); + if (coarser_efficiency + 0.01 >= max_efficiency) { + // Taking it. + block_size = coarser_block_size; + block_count = coarser_block_count; + if (max_efficiency < coarser_efficiency) { + max_efficiency = coarser_efficiency; + } + } + } + + // Recursively divide size into halves until we reach block_size. + // Division code rounds mid to block_size, so we are guaranteed to get + // block_count leaves that do actual computations. + Barrier barrier(static_cast<unsigned int>(block_count)); + std::function<void(Index, Index)> handleRange; + handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) { + if (last - first <= block_size) { + // Single block or less, execute directly. + f(first, last); + barrier.Notify(); + return; + } + // Split into halves and submit to the pool. + Index mid = first + divup((last - first) / 2, block_size) * block_size; + pool_->Schedule([=, &handleRange]() { handleRange(mid, last); }); + pool_->Schedule([=, &handleRange]() { handleRange(first, mid); }); + }; + handleRange(0, n); + barrier.Wait(); + } + + // Convenience wrapper for parallelFor that does not align blocks. 
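A caller-side sketch of the interface described above, using the convenience overload that follows; the buffers and the per-element cost figure are assumptions for illustration, not taken from this patch:

    // Add 1.0f to every element of src, writing dst, in parallel.
    // TensorOpCost(bytes loaded, bytes stored, compute cycles) per element lets
    // the device pick a block size and decide whether to parallelize at all.
    void addOne(const Eigen::ThreadPoolDevice& device,
                const float* src, float* dst, Eigen::Index n) {
      device.parallelFor(
          n, Eigen::TensorOpCost(sizeof(float), sizeof(float), /*compute=*/1),
          [=](Eigen::Index first, Eigen::Index last) {
            for (Eigen::Index i = first; i < last; ++i) dst[i] = src[i] + 1.0f;
          });
    }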
+ void parallelFor(Index n, const TensorOpCost& cost, + std::function<void(Index, Index)> f) const { + parallelFor(n, cost, nullptr, std::move(f)); } private: ThreadPoolInterface* pool_; - size_t num_threads_; + int num_threads_; }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h index ca9ac79df..1a30e45fb 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensionList.h @@ -44,7 +44,7 @@ template<DenseIndex n, typename Index, std::size_t Rank> const Index array_get(c } -#if defined(EIGEN_HAS_CONSTEXPR) +#if EIGEN_HAS_CONSTEXPR template <typename Index, std::size_t Rank> struct index_known_statically_impl<DimensionList<Index, Rank> > { EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h index f0b8ac958..b24cdebf1 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDimensions.h @@ -29,14 +29,6 @@ namespace Eigen { * \sa Tensor */ -// Can't use std::pair on cuda devices -template <typename Index> struct IndexPair { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) { } - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Index f, Index s) : first(f), second(s) { } - Index first; - Index second; -}; - // Boilerplate code namespace internal { @@ -115,7 +107,7 @@ struct Sizes : internal::numeric_list<std::ptrdiff_t, Indices...> { explicit EIGEN_DEVICE_FUNC Sizes(const array<DenseIndex, Base::count>& /*indices*/) { // todo: add assertion } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template <typename... DenseIndex> EIGEN_DEVICE_FUNC Sizes(DenseIndex...) { } explicit EIGEN_DEVICE_FUNC Sizes(std::initializer_list<std::ptrdiff_t> /*l*/) { // todo: add assertion @@ -182,7 +174,7 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0 return *this; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template <typename... DenseIndex> Sizes(DenseIndex... /*indices*/) { } explicit Sizes(std::initializer_list<std::size_t>) { // todo: add assertion @@ -190,13 +182,13 @@ template <std::size_t V1=0, std::size_t V2=0, std::size_t V3=0, std::size_t V4=0 #else EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex) { } - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex) { + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex) { } - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex) { + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex) { } - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { } - EIGEN_DEVICE_FUNC explicit Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { + EIGEN_DEVICE_FUNC Sizes(const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex, const DenseIndex) { } #endif @@ -290,31 +282,31 @@ struct DSizes : array<DenseIndex, NumDims> { (*this)[0] = i0; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... 
IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE explicit DSizes(DenseIndex firstDimension, DenseIndex secondDimension, IndexTypes... otherDimensions) : Base({{firstDimension, secondDimension, otherDimensions...}}) { EIGEN_STATIC_ASSERT(sizeof...(otherDimensions) + 2 == NumDims, YOU_MADE_A_PROGRAMMING_MISTAKE) } #else - EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1) { + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1) { eigen_assert(NumDims == 2); (*this)[0] = i0; (*this)[1] = i1; } - EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2) { eigen_assert(NumDims == 3); (*this)[0] = i0; (*this)[1] = i1; (*this)[2] = i2; } - EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3) { eigen_assert(NumDims == 4); (*this)[0] = i0; (*this)[1] = i1; (*this)[2] = i2; (*this)[3] = i3; } - EIGEN_DEVICE_FUNC explicit DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { + EIGEN_DEVICE_FUNC DSizes(const DenseIndex i0, const DenseIndex i1, const DenseIndex i2, const DenseIndex i3, const DenseIndex i4) { eigen_assert(NumDims == 5); (*this)[0] = i0; (*this)[1] = i1; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h index c556fec0f..a08dfa7c3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h @@ -56,7 +56,7 @@ struct nested<TensorEvalToOp<XprType>, 1, typename eval<TensorEvalToOp<XprType> template<typename XprType> -class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType> > +class TensorEvalToOp : public TensorBase<TensorEvalToOp<XprType>, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits<TensorEvalToOp>::Scalar Scalar; @@ -94,7 +94,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device> static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { - IsAligned = true, + IsAligned = TensorEvaluator<ArgType, Device>::IsAligned, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, // to be implemented diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h index ae4ce3c90..61c111cec 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h @@ -129,6 +129,10 @@ template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE double loadConstant(const double* address) { return __ldg(address); } +template <> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE +Eigen::half loadConstant(const Eigen::half* address) { + return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x))); +} #endif } @@ -222,7 +226,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) - : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device) + : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper() { } typedef typename XprType::Index Index; @@ -239,13 +243,13 @@ struct TensorEvaluator<const 
TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const { - return m_functor(index); + return m_wrapper(m_functor, index); } template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - return m_functor.template packetOp<Index, PacketReturnType>(index); + return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost @@ -259,6 +263,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device> private: const NullaryOp m_functor; TensorEvaluator<ArgType, Device> m_argImpl; + const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper; }; @@ -399,6 +404,101 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArg TensorEvaluator<RightArgType, Device> m_rightImpl; }; +// -------------------- CwiseTernaryOp -------------------- + +template<typename TernaryOp, typename Arg1Type, typename Arg2Type, typename Arg3Type, typename Device> +struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device> +{ + typedef TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type> XprType; + + enum { + IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned, + PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess & + internal::functor_traits<TernaryOp>::PacketAccess, + Layout = TensorEvaluator<Arg1Type, Device>::Layout, + CoordAccess = false, // to be implemented + RawAccess = false + }; + + EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device) + : m_functor(op.functor()), + m_arg1Impl(op.arg1Expression(), device), + m_arg2Impl(op.arg2Expression(), device), + m_arg3Impl(op.arg3Expression(), device) + { + EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE); + + EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind, + typename internal::traits<Arg2Type>::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind, + typename internal::traits<Arg3Type>::StorageKind>::value), + STORAGE_KIND_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index, + typename internal::traits<Arg2Type>::Index>::value), + STORAGE_INDEX_MUST_MATCH) + EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index, + typename internal::traits<Arg3Type>::Index>::value), + STORAGE_INDEX_MUST_MATCH) + + eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions())); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::traits<XprType>::Scalar CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; + typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions; + + EIGEN_DEVICE_FUNC const Dimensions& dimensions() const + { + // TODO: 
use arg2 or arg3 dimensions if they are known at compile time. + return m_arg1Impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_arg1Impl.evalSubExprsIfNeeded(NULL); + m_arg2Impl.evalSubExprsIfNeeded(NULL); + m_arg3Impl.evalSubExprsIfNeeded(NULL); + return true; + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_arg1Impl.cleanup(); + m_arg2Impl.cleanup(); + m_arg3Impl.cleanup(); + } + + EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const + { + return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index)); + } + template<int LoadMode> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const + { + return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(index), + m_arg2Impl.template packet<LoadMode>(index), + m_arg3Impl.template packet<LoadMode>(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost + costPerCoeff(bool vectorized) const { + const double functor_cost = internal::functor_traits<TernaryOp>::Cost; + return m_arg1Impl.costPerCoeff(vectorized) + + m_arg2Impl.costPerCoeff(vectorized) + + m_arg3Impl.costPerCoeff(vectorized) + + TensorOpCost(0, 0, functor_cost, vectorized, PacketSize); + } + + EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + + private: + const TernaryOp m_functor; + TensorEvaluator<Arg1Type, Device> m_arg1Impl; + TensorEvaluator<Arg1Type, Device> m_arg2Impl; + TensorEvaluator<Arg3Type, Device> m_arg3Impl; +}; + // -------------------- SelectOp -------------------- @@ -475,7 +575,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType> .cwiseMax(m_elseImpl.costPerCoeff(vectorized)); } - EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; } private: TensorEvaluator<IfArgType, Device> m_condImpl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h index 5c3d4d630..0cac7b179 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h @@ -59,13 +59,14 @@ class TensorExecutor<Expression, DefaultDevice, true> { const Index size = array_prod(evaluator.dimensions()); const int PacketSize = unpacket_traits<typename TensorEvaluator<Expression, DefaultDevice>::PacketReturnType>::size; - // Manually unroll this loop since compilers don't do it. + // Give the compiler a strong hint to unroll the loop. But don't insist + // on unrolling, because if the function is expensive the compiler should not + // unroll the loop at the expense of inlining. 
const Index UnrolledSize = (size / (4 * PacketSize)) * 4 * PacketSize; for (Index i = 0; i < UnrolledSize; i += 4*PacketSize) { - evaluator.evalPacket(i); - evaluator.evalPacket(i+PacketSize); - evaluator.evalPacket(i+2*PacketSize); - evaluator.evalPacket(i+3*PacketSize); + for (Index j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } } const Index VectorizedSize = (size / PacketSize) * PacketSize; for (Index i = UnrolledSize; i < VectorizedSize; i += PacketSize) { @@ -92,24 +93,30 @@ struct EvalRange { evaluator.evalScalar(i); } } + + static Index alignBlockSize(Index size) { + return size; + } }; template <typename Evaluator, typename Index> struct EvalRange<Evaluator, Index, true> { + static const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; + static void run(Evaluator* evaluator_in, const Index first, const Index last) { Evaluator evaluator = *evaluator_in; eigen_assert(last >= first); Index i = first; - const int PacketSize = unpacket_traits<typename Evaluator::PacketReturnType>::size; if (last - first >= PacketSize) { eigen_assert(first % PacketSize == 0); Index last_chunk_offset = last - 4 * PacketSize; - // Manually unroll this loop since compilers don't do it. + // Give the compiler a strong hint to unroll the loop. But don't insist + // on unrolling, because if the function is expensive the compiler should not + // unroll the loop at the expense of inlining. for (; i <= last_chunk_offset; i += 4*PacketSize) { - evaluator.evalPacket(i); - evaluator.evalPacket(i+PacketSize); - evaluator.evalPacket(i+2*PacketSize); - evaluator.evalPacket(i+3*PacketSize); + for (Index j = 0; j < 4; j++) { + evaluator.evalPacket(i + j * PacketSize); + } } last_chunk_offset = last - PacketSize; for (; i <= last_chunk_offset; i += PacketSize) { @@ -120,6 +127,15 @@ struct EvalRange<Evaluator, Index, true> { evaluator.evalScalar(i); } } + + static Index alignBlockSize(Index size) { + // Align block size to packet size and account for unrolling in run above. + if (size >= 16 * PacketSize) { + return (size + 4 * PacketSize - 1) & ~(4 * PacketSize - 1); + } + // Aligning to 4 * PacketSize would increase block size by more than 25%. + return (size + PacketSize - 1) & ~(PacketSize - 1); + } }; template <typename Expression, bool Vectorizable> @@ -133,18 +149,23 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> { const bool needs_assign = evaluator.evalSubExprsIfNeeded(NULL); if (needs_assign) { - const Index PacketSize = Vectorizable ? unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; const Index size = array_prod(evaluator.dimensions()); +#if !defined(EIGEN_USE_SIMPLE_THREAD_POOL) + device.parallelFor(size, evaluator.costPerCoeff(Vectorizable), + EvalRange<Evaluator, Index, Vectorizable>::alignBlockSize, + [&evaluator](Index first, Index last) { + EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, first, last); + }); +#else size_t num_threads = device.numThreads(); -#ifdef EIGEN_USE_COST_MODEL if (num_threads > 1) { num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( size, evaluator.costPerCoeff(Vectorizable), num_threads); } -#endif if (num_threads == 1) { EvalRange<Evaluator, Index, Vectorizable>::run(&evaluator, 0, size); } else { + const Index PacketSize = Vectorizable ? 
unpacket_traits<typename Evaluator::PacketReturnType>::size : 1; Index blocksz = std::ceil<Index>(static_cast<float>(size)/num_threads) + PacketSize - 1; const Index blocksize = numext::maxi<Index>(PacketSize, (blocksz - (blocksz % PacketSize))); const Index numblocks = size / blocksize; @@ -161,11 +182,12 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> { } barrier.Wait(); } +#endif // defined(!EIGEN_USE_SIMPLE_THREAD_POOL) } evaluator.cleanup(); } }; -#endif +#endif // EIGEN_USE_THREADS // GPU: the evaluation of the expression is offloaded to a GPU. @@ -212,16 +234,11 @@ struct EigenMetaKernelEval<Evaluator, Index, true> { template <typename Evaluator, typename Index> __global__ void __launch_bounds__(1024) -EigenMetaKernel(Evaluator memcopied_eval, Index size) { +EigenMetaKernel(Evaluator eval, Index size) { const Index first_index = blockIdx.x * blockDim.x + threadIdx.x; const Index step_size = blockDim.x * gridDim.x; - // Cuda memcopies the kernel arguments. That's fine for POD, but for more - // complex types such as evaluators we should really conform to the C++ - // standard and call a proper copy constructor. - Evaluator eval(memcopied_eval); - const bool vectorizable = Evaluator::PacketAccess & Evaluator::IsAligned; EigenMetaKernelEval<Evaluator, Index, vectorizable>::run(eval, first_index, size, step_size); } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h index 8491c4ca2..5f2e329f2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h @@ -219,6 +219,86 @@ class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsX namespace internal { +template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> +struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> > +{ + // Type promotion to handle the case where the types of the args are different. 
+ typedef typename result_of< + TernaryOp(typename Arg1XprType::Scalar, + typename Arg2XprType::Scalar, + typename Arg3XprType::Scalar)>::type Scalar; + typedef traits<Arg1XprType> XprTraits; + typedef typename traits<Arg1XprType>::StorageKind StorageKind; + typedef typename traits<Arg1XprType>::Index Index; + typedef typename Arg1XprType::Nested Arg1Nested; + typedef typename Arg2XprType::Nested Arg2Nested; + typedef typename Arg3XprType::Nested Arg3Nested; + typedef typename remove_reference<Arg1Nested>::type _Arg1Nested; + typedef typename remove_reference<Arg2Nested>::type _Arg2Nested; + typedef typename remove_reference<Arg3Nested>::type _Arg3Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; + + enum { + Flags = 0 + }; +}; + +template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> +struct eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, Eigen::Dense> +{ + typedef const TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>& type; +}; + +template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> +struct nested<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, 1, typename eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >::type> +{ + typedef TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> type; +}; + +} // end namespace internal + + + +template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> +class TensorCwiseTernaryOp : public TensorBase<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, ReadOnlyAccessors> +{ + public: + typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef Scalar CoeffReturnType; + typedef typename Eigen::internal::nested<TensorCwiseTernaryOp>::type Nested; + typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp()) + : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {} + + EIGEN_DEVICE_FUNC + const TernaryOp& functor() const { return m_functor; } + + /** \returns the nested expressions */ + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename Arg1XprType::Nested>::type& + arg1Expression() const { return m_arg1_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename Arg1XprType::Nested>::type& + arg2Expression() const { return m_arg2_xpr; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename Arg3XprType::Nested>::type& + arg3Expression() const { return m_arg3_xpr; } + + protected: + typename Arg1XprType::Nested m_arg1_xpr; + typename Arg1XprType::Nested m_arg2_xpr; + typename Arg3XprType::Nested m_arg3_xpr; + const TernaryOp m_functor; +}; + + +namespace internal { template<typename IfXprType, typename ThenXprType, typename ElseXprType> struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> > : traits<ThenXprType> @@ -252,7 +332,7 @@ struct nested<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, 1, typename e template<typename IfXprType, typename ThenXprType, typename ElseXprType> -class 
TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> > +class TensorSelectOp : public TensorBase<TensorSelectOp<IfXprType, ThenXprType, ElseXprType>, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits<TensorSelectOp>::Scalar Scalar; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h index ece2ed91b..08eb5595a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h @@ -329,7 +329,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D for (Index i = 0; i < n; ++i) { if(FFTDir == FFT_FORWARD) { - a[i] = data[i] * std::conj(pos_j_base_powered[i]); + a[i] = data[i] * numext::conj(pos_j_base_powered[i]); } else { a[i] = data[i] * pos_j_base_powered[i]; @@ -344,7 +344,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D b[i] = pos_j_base_powered[i]; } else { - b[i] = std::conj(pos_j_base_powered[i]); + b[i] = numext::conj(pos_j_base_powered[i]); } } for (Index i = n; i < m - n; ++i) { @@ -355,7 +355,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D b[i] = pos_j_base_powered[m-i]; } else { - b[i] = std::conj(pos_j_base_powered[m-i]); + b[i] = numext::conj(pos_j_base_powered[m-i]); } } @@ -379,7 +379,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, D for (Index i = 0; i < n; ++i) { if(FFTDir == FFT_FORWARD) { - data[i] = a[i] * std::conj(pos_j_base_powered[i]); + data[i] = a[i] * numext::conj(pos_j_base_powered[i]); } else { data[i] = a[i] * pos_j_base_powered[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h index b27ee0084..fcee5f60d 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFixedSize.h @@ -65,7 +65,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, inline Self& base() { return *this; } inline const Self& base() const { return *this; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& coeff(Index firstIndex, IndexTypes... otherIndices) const { @@ -97,7 +97,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& coeffRef(Index firstIndex, IndexTypes... otherIndices) { @@ -128,7 +128,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, return m_storage.data()[0]; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, IndexTypes... otherIndices) const { @@ -213,7 +213,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, return coeff(index); } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, IndexTypes... 
otherIndices) { @@ -309,7 +309,7 @@ class TensorFixedSize : public TensorBase<TensorFixedSize<Scalar_, Dimensions_, { } -#ifdef EIGEN_HAVE_RVALUE_REFERENCES +#if EIGEN_HAS_RVALUE_REFERENCES EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorFixedSize(Self&& other) : m_storage(other.m_storage) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h index 7ec757519..c23ecdbc4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h @@ -55,7 +55,7 @@ struct nested<TensorForcedEvalOp<XprType>, 1, typename eval<TensorForcedEvalOp<X template<typename XprType> -class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType> > +class TensorForcedEvalOp : public TensorBase<TensorForcedEvalOp<XprType>, ReadOnlyAccessors> { public: typedef typename Eigen::internal::traits<TensorForcedEvalOp>::Scalar Scalar; @@ -102,7 +102,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device> EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { - const Index numValues = m_impl.dimensions().TotalSize(); + const Index numValues = internal::array_prod(m_impl.dimensions()); m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType)); // Should initialize the memory in case we're dealing with non POD types. if (NumTraits<CoeffReturnType>::RequireInitialization) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h index a8bd8b888..490ddd8bd 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h @@ -16,11 +16,12 @@ template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize; template<typename PlainObjectType, int Options_ = Unaligned> class TensorMap; template<typename PlainObjectType> class TensorRef; -template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> class TensorBase; +template<typename Derived, int AccessLevel> class TensorBase; template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp; template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp; template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp; +template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> class TensorCwiseTernaryOp; template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp; template<typename Op, typename Dims, typename XprType> class TensorReductionOp; template<typename XprType> class TensorIndexTupleOp; @@ -42,9 +43,11 @@ template<typename ReverseDimensions, typename XprType> class TensorReverseOp; template<typename PaddingDimensions, typename XprType> class TensorPaddingOp; template<typename Shuffle, typename XprType> class TensorShufflingOp; template<typename Strides, typename XprType> class TensorStridingOp; +template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> class TensorStridingSlicingOp; template<typename Strides, typename XprType> class TensorInflationOp; template<typename Generator, typename XprType> class 
TensorGeneratorOp; template<typename LeftXprType, typename RightXprType> class TensorAssignOp; +template<typename Op, typename XprType> class TensorScanOp; template<typename CustomUnaryFunc, typename XprType> class TensorCustomUnaryOp; template<typename CustomBinaryFunc, typename LhsXprType, typename RhsXprType> class TensorCustomBinaryOp; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h index 33cd00391..7164e8d60 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h @@ -25,7 +25,7 @@ struct scalar_mod_op { }; template <typename Scalar> struct functor_traits<scalar_mod_op<Scalar> > -{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; }; +{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; }; /** \internal @@ -38,7 +38,7 @@ struct scalar_mod2_op { }; template <typename Scalar> struct functor_traits<scalar_mod2_op<Scalar> > -{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; }; +{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; }; template <typename Scalar> struct scalar_fmod_op { @@ -69,7 +69,7 @@ struct scalar_sigmoid_op { template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet packetOp(const Packet& x) const { - const Packet one = pset1<Packet>(1); + const Packet one = pset1<Packet>(T(1)); return pdiv(one, padd(one, pexp(pnegate(x)))); } }; @@ -84,14 +84,23 @@ struct functor_traits<scalar_sigmoid_op<T> > { }; +template<typename Reducer, typename Device> +struct reducer_traits { + enum { + Cost = 1, + PacketAccess = false + }; +}; + // Standard reduction functors template <typename T> struct SumReducer { - static const bool PacketAccess = true; + static const bool PacketAccess = packet_traits<T>::HasAdd; static const bool IsStateful = false; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { - (*accum) += t; + internal::scalar_sum_op<T> sum_op; + *accum = sum_op(*accum, t); } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { @@ -119,16 +128,26 @@ template <typename T> struct SumReducer } }; +template <typename T, typename Device> +struct reducer_traits<SumReducer<T>, Device> { + enum { + Cost = NumTraits<T>::AddCost, + PacketAccess = PacketType<T, Device>::HasAdd + }; +}; + + template <typename T> struct MeanReducer { - static const bool PacketAccess = !NumTraits<T>::IsInteger; + static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger; static const bool IsStateful = true; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE MeanReducer() : scalarCount_(0), packetCount_(0) { } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) { - (*accum) += t; + internal::scalar_sum_op<T> sum_op; + *accum = sum_op(*accum, t); scalarCount_++; } template <typename Packet> @@ -162,9 +181,44 @@ template <typename T> struct MeanReducer DenseIndex packetCount_; }; +template <typename T, typename Device> +struct reducer_traits<MeanReducer<T>, Device> { + enum { + Cost = NumTraits<T>::AddCost, + PacketAccess = PacketType<T, Device>::HasAdd + }; +}; + + +template <typename T, bool IsMax = true, bool IsInteger = true> +struct MinMaxBottomValue { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return Eigen::NumTraits<T>::lowest(); + } +}; +template <typename T> +struct MinMaxBottomValue<T, 
true, false> { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return -Eigen::NumTraits<T>::infinity(); + } +}; +template <typename T> +struct MinMaxBottomValue<T, false, true> { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return Eigen::NumTraits<T>::highest(); + } +}; +template <typename T> +struct MinMaxBottomValue<T, false, false> { + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE T bottom_value() { + return Eigen::NumTraits<T>::infinity(); + } +}; + + template <typename T> struct MaxReducer { - static const bool PacketAccess = true; + static const bool PacketAccess = packet_traits<T>::HasMax; static const bool IsStateful = false; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { @@ -174,9 +228,8 @@ template <typename T> struct MaxReducer EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { (*accum) = pmax<Packet>(*accum, p); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return Eigen::NumTraits<T>::lowest(); + return MinMaxBottomValue<T, true, Eigen::NumTraits<T>::IsInteger>::bottom_value(); } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { @@ -195,9 +248,18 @@ template <typename T> struct MaxReducer } }; +template <typename T, typename Device> +struct reducer_traits<MaxReducer<T>, Device> { + enum { + Cost = NumTraits<T>::AddCost, + PacketAccess = PacketType<T, Device>::HasMax + }; +}; + + template <typename T> struct MinReducer { - static const bool PacketAccess = true; + static const bool PacketAccess = packet_traits<T>::HasMin; static const bool IsStateful = false; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { @@ -207,9 +269,8 @@ template <typename T> struct MinReducer EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const { (*accum) = pmin<Packet>(*accum, p); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const { - return Eigen::NumTraits<T>::highest(); + return MinMaxBottomValue<T, false, Eigen::NumTraits<T>::IsInteger>::bottom_value(); } template <typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const { @@ -228,10 +289,18 @@ template <typename T> struct MinReducer } }; +template <typename T, typename Device> +struct reducer_traits<MinReducer<T>, Device> { + enum { + Cost = NumTraits<T>::AddCost, + PacketAccess = PacketType<T, Device>::HasMin + }; +}; + template <typename T> struct ProdReducer { - static const bool PacketAccess = true; + static const bool PacketAccess = packet_traits<T>::HasMul; static const bool IsStateful = false; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const T t, T* accum) const { @@ -263,6 +332,14 @@ template <typename T> struct ProdReducer } }; +template <typename T, typename Device> +struct reducer_traits<ProdReducer<T>, Device> { + enum { + Cost = NumTraits<T>::MulCost, + PacketAccess = PacketType<T, Device>::HasMul + }; +}; + struct AndReducer { @@ -280,6 +357,15 @@ struct AndReducer } }; +template <typename Device> +struct reducer_traits<AndReducer, Device> { + enum { + Cost = 1, + PacketAccess = false + }; +}; + + struct OrReducer { static const bool PacketAccess = false; static const bool IsStateful = false; @@ -295,6 +381,15 @@ struct OrReducer { } }; +template <typename Device> +struct reducer_traits<OrReducer, Device> { + enum { + Cost = 1, + PacketAccess = false + }; +}; + + // Argmin/Argmax reducers template <typename T> struct 
ArgMaxTupleReducer { @@ -312,6 +407,15 @@ template <typename T> struct ArgMaxTupleReducer } }; +template <typename T, typename Device> +struct reducer_traits<ArgMaxTupleReducer<T>, Device> { + enum { + Cost = NumTraits<T>::AddCost, + PacketAccess = false + }; +}; + + template <typename T> struct ArgMinTupleReducer { static const bool PacketAccess = false; @@ -328,457 +432,11 @@ template <typename T> struct ArgMinTupleReducer } }; - -// Random number generation -namespace { -#ifdef __CUDA_ARCH__ -__device__ int get_random_seed() { - return clock(); -} -#else -int get_random_seed() { -#ifdef _WIN32 - SYSTEMTIME st; - GetSystemTime(&st); - return st.wSecond + 1000 * st.wMilliseconds; -#elif defined __APPLE__ - return static_cast<int>(mach_absolute_time()); -#else - timespec ts; - clock_gettime(CLOCK_REALTIME, &ts); - return static_cast<int>(ts.tv_nsec); -#endif -} -#endif -} - -#if !defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__) -// We're not compiling a cuda kernel -template <typename T> class UniformRandomGenerator { - - public: - static const bool PacketAccess = true; - - UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - if (!deterministic) { - srand(get_random_seed()); - } - } - UniformRandomGenerator(const UniformRandomGenerator& other) { - m_deterministic = other.m_deterministic; - } - - template<typename Index> - T operator()(Index) const { - return random<T>(); - } - template<typename Index, typename PacketType> - PacketType packetOp(Index) const { - const int packetSize = internal::unpacket_traits<PacketType>::size; - EIGEN_ALIGN_MAX T values[packetSize]; - for (int i = 0; i < packetSize; ++i) { - values[i] = random<T>(); - } - return internal::pload<PacketType>(values); - } - - private: - bool m_deterministic; -}; - -#if __cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900 -template <> class UniformRandomGenerator<float> { - public: - static const bool PacketAccess = true; - - UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) { - if (!deterministic) { - m_generator->seed(get_random_seed()); - } - } - UniformRandomGenerator(const UniformRandomGenerator<float>& other) { - m_generator = new std::mt19937(); - m_generator->seed(other(0) * UINT_MAX); - m_deterministic = other.m_deterministic; - } - ~UniformRandomGenerator() { - delete m_generator; - } - - template<typename Index> - float operator()(Index) const { - return m_distribution(*m_generator); - } - template<typename Index, typename PacketType> - PacketType packetOp(Index i) const { - const int packetSize = internal::unpacket_traits<PacketType>::size; - EIGEN_ALIGN_MAX float values[packetSize]; - for (int k = 0; k < packetSize; ++k) { - values[k] = this->operator()(i); - } - return internal::pload<PacketType>(values); - } - - private: - UniformRandomGenerator& operator = (const UniformRandomGenerator&); - // Make sure m_deterministic comes first to match the layout of the cpu - // version of the code. 
- bool m_deterministic; - std::mt19937* m_generator; - mutable std::uniform_real_distribution<float> m_distribution; -}; - -template <> class UniformRandomGenerator<double> { - public: - static const bool PacketAccess = true; - - UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_generator(new std::mt19937()) { - if (!deterministic) { - m_generator->seed(get_random_seed()); - } - } - UniformRandomGenerator(const UniformRandomGenerator<double>& other) { - m_generator = new std::mt19937(); - m_generator->seed(other(0) * UINT_MAX); - m_deterministic = other.m_deterministic; - } - ~UniformRandomGenerator() { - delete m_generator; - } - - template<typename Index> - double operator()(Index) const { - return m_distribution(*m_generator); - } - template<typename Index, typename PacketType> - PacketType packetOp(Index i) const { - const int packetSize = internal::unpacket_traits<PacketType>::size; - EIGEN_ALIGN_MAX double values[packetSize]; - for (int k = 0; k < packetSize; ++k) { - values[k] = this->operator()(i); - } - return internal::pload<PacketType>(values); - } - - private: - UniformRandomGenerator& operator = (const UniformRandomGenerator&); - // Make sure m_deterministic comes first to match the layout of the cpu - // version of the code. - bool m_deterministic; - std::mt19937* m_generator; - mutable std::uniform_real_distribution<double> m_distribution; -}; -#endif - -#else - -// We're compiling a cuda kernel -template <typename T> class UniformRandomGenerator; - -template <> class UniformRandomGenerator<float> { - public: - static const bool PacketAccess = true; - - __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - - __device__ UniformRandomGenerator(const UniformRandomGenerator& other) { - m_deterministic = other.m_deterministic; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = m_deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - - template<typename Index> - __device__ float operator()(Index) const { - return curand_uniform(&m_state); - } - template<typename Index, typename PacketType> - __device__ float4 packetOp(Index) const { - EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - return curand_uniform4(&m_state); - } - - private: - bool m_deterministic; - mutable curandStatePhilox4_32_10_t m_state; -}; - -template <> class UniformRandomGenerator<double> { - public: - static const bool PacketAccess = true; - - __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - __device__ UniformRandomGenerator(const UniformRandomGenerator& other) { - m_deterministic = other.m_deterministic; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = m_deterministic ? 
0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - template<typename Index> - __device__ double operator()(Index) const { - return curand_uniform_double(&m_state); - } - template<typename Index, typename PacketType> - __device__ double2 packetOp(Index) const { - EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - return curand_uniform2_double(&m_state); - } - - private: - bool m_deterministic; - mutable curandStatePhilox4_32_10_t m_state; -}; - -template <> class UniformRandomGenerator<std::complex<float> > { - public: - static const bool PacketAccess = false; - - __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - __device__ UniformRandomGenerator(const UniformRandomGenerator& other) { - m_deterministic = other.m_deterministic; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = m_deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - template<typename Index> - __device__ std::complex<float> operator()(Index) const { - float4 vals = curand_uniform4(&m_state); - return std::complex<float>(vals.x, vals.y); - } - - private: - bool m_deterministic; - mutable curandStatePhilox4_32_10_t m_state; -}; - -template <> class UniformRandomGenerator<std::complex<double> > { - public: - static const bool PacketAccess = false; - - __device__ UniformRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - __device__ UniformRandomGenerator(const UniformRandomGenerator& other) { - m_deterministic = other.m_deterministic; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = m_deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - template<typename Index> - __device__ std::complex<double> operator()(Index) const { - double2 vals = curand_uniform2_double(&m_state); - return std::complex<double>(vals.x, vals.y); - } - - private: - bool m_deterministic; - mutable curandStatePhilox4_32_10_t m_state; -}; - -#endif - -template <typename Scalar> -struct functor_traits<UniformRandomGenerator<Scalar> > { - enum { - // Rough estimate. 
- Cost = 100 * NumTraits<Scalar>::MulCost, - PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess - }; -}; - - - -#if (!defined (EIGEN_USE_GPU) || !defined(__CUDACC__) || !defined(__CUDA_ARCH__)) && (__cplusplus > 199711 || EIGEN_COMP_MSVC >= 1900) -// We're not compiling a cuda kernel -template <typename T> class NormalRandomGenerator { - public: - static const bool PacketAccess = true; - - NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic), m_distribution(0, 1), m_generator(new std::mt19937()) { - if (!deterministic) { - m_generator->seed(get_random_seed()); - } - } - NormalRandomGenerator(const NormalRandomGenerator& other) - : m_deterministic(other.m_deterministic), m_distribution(other.m_distribution), m_generator(new std::mt19937()) { - m_generator->seed(other(0) * UINT_MAX); - } - ~NormalRandomGenerator() { - delete m_generator; - } - template<typename Index> - T operator()(Index) const { - return m_distribution(*m_generator); - } - template<typename Index, typename PacketType> - PacketType packetOp(Index) const { - const int packetSize = internal::unpacket_traits<PacketType>::size; - EIGEN_ALIGN_MAX T values[packetSize]; - for (int i = 0; i < packetSize; ++i) { - values[i] = m_distribution(*m_generator); - } - return internal::pload<PacketType>(values); - } - - private: - // No assignment - NormalRandomGenerator& operator = (const NormalRandomGenerator&); - - bool m_deterministic; - mutable std::normal_distribution<T> m_distribution; - std::mt19937* m_generator; -}; - -#elif defined (EIGEN_USE_GPU) && defined(__CUDACC__) && defined(__CUDA_ARCH__) - -// We're compiling a cuda kernel -template <typename T> class NormalRandomGenerator; - -template <> class NormalRandomGenerator<float> { - public: - static const bool PacketAccess = true; - - __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - __device__ NormalRandomGenerator(const NormalRandomGenerator<float>& other) { - m_deterministic = other.m_deterministic; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = m_deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - template<typename Index> - __device__ float operator()(Index) const { - return curand_normal(&m_state); - } - template<typename Index, typename PacketType> - __device__ float4 packetOp(Index) const { - EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - return curand_normal4(&m_state); - } - - private: - bool m_deterministic; - mutable curandStatePhilox4_32_10_t m_state; -}; - -template <> class NormalRandomGenerator<double> { - public: - static const bool PacketAccess = true; - - __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - __device__ NormalRandomGenerator(const NormalRandomGenerator<double>& other) { - m_deterministic = other.m_deterministic; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = m_deterministic ? 
0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - template<typename Index> - __device__ double operator()(Index) const { - return curand_normal_double(&m_state); - } - template<typename Index, typename PacketType> - __device__ double2 packetOp(Index) const { - EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE); - return curand_normal2_double(&m_state); - } - - private: - bool m_deterministic; - mutable curandStatePhilox4_32_10_t m_state; -}; - -template <> class NormalRandomGenerator<std::complex<float> > { - public: - static const bool PacketAccess = false; - - __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - __device__ NormalRandomGenerator(const NormalRandomGenerator& other) { - m_deterministic = other.m_deterministic; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = m_deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - template<typename Index> - __device__ std::complex<float> operator()(Index) const { - float4 vals = curand_normal4(&m_state); - return std::complex<float>(vals.x, vals.y); - } - - private: - bool m_deterministic; - mutable curandStatePhilox4_32_10_t m_state; -}; - -template <> class NormalRandomGenerator<std::complex<double> > { - public: - static const bool PacketAccess = false; - - __device__ NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) { - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - __device__ NormalRandomGenerator(const NormalRandomGenerator& other) { - m_deterministic = other.m_deterministic; - const int tid = blockIdx.x * blockDim.x + threadIdx.x; - const int seed = m_deterministic ? 0 : get_random_seed(); - curand_init(seed, tid, 0, &m_state); - } - template<typename Index> - __device__ std::complex<double> operator()(Index) const { - double2 vals = curand_normal2_double(&m_state); - return std::complex<double>(vals.x, vals.y); - } - - private: - bool m_deterministic; - mutable curandStatePhilox4_32_10_t m_state; -}; - -#else - -template <typename T> class NormalRandomGenerator { - public: - static const bool PacketAccess = false; - NormalRandomGenerator(bool deterministic = true) : m_deterministic(deterministic) {} - - private: - bool m_deterministic; -}; - -#endif - -template <typename Scalar> -struct functor_traits<NormalRandomGenerator<Scalar> > { +template <typename T, typename Device> +struct reducer_traits<ArgMinTupleReducer<T>, Device> { enum { - // Rough estimate. 
- Cost = 100 * NumTraits<Scalar>::MulCost, - PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess + Cost = NumTraits<T>::AddCost, + PacketAccess = false }; }; @@ -797,7 +455,7 @@ class GaussianGenerator { } } - T operator()(const array<Index, NumDims>& coordinates) const { + EIGEN_DEVICE_FUNC T operator()(const array<Index, NumDims>& coordinates) const { T tmp = T(0); for (size_t i = 0; i < NumDims; ++i) { T offset = coordinates[i] - m_means[i]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h index 8ff7d5815..eb1d4934e 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGenerator.h @@ -134,7 +134,7 @@ struct TensorEvaluator<const TensorGeneratorOp<Generator, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[packetSize]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h new file mode 100644 index 000000000..665b861cf --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h @@ -0,0 +1,33 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H +#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H + +namespace Eigen { + +/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given tensors. + * + * This function computes the regularized incomplete beta function (integral). 
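A minimal usage sketch for this new entry point (the tensor names and sizes are illustrative, and it assumes a C++11 build in which the scalar_betainc_op support the functor relies on is available):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 2> a(2, 2), b(2, 2), x(2, 2);
      a.setConstant(2.0f);
      b.setConstant(3.0f);
      x.setConstant(0.5f);
      // Coefficient-wise regularized incomplete beta: r(i,j) = betainc(a(i,j), b(i,j), x(i,j)).
      Eigen::Tensor<float, 2> r = Eigen::betainc(a, b, x);
      std::cout << r << std::endl;
      return 0;
    }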
+ * + */ +template <typename ADerived, typename BDerived, typename XDerived> +EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const + TensorCwiseTernaryOp<internal::scalar_betainc_op<typename XDerived::Scalar>, + const ADerived, const BDerived, const XDerived> + betainc(const ADerived& a, const BDerived& b, const XDerived& x) { + return TensorCwiseTernaryOp< + internal::scalar_betainc_op<typename XDerived::Scalar>, const ADerived, + const BDerived, const XDerived>( + a, b, x, internal::scalar_betainc_op<typename XDerived::Scalar>()); +} + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h index 38a833f82..a901c5dd4 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h @@ -13,38 +13,61 @@ namespace Eigen { namespace internal { -template<> -struct significant_decimals_impl<std::string> - : significant_decimals_default_impl<std::string, true> -{}; -} +// Print the tensor as a 2d matrix +template <typename Tensor, int Rank> +struct TensorPrinter { + static void run (std::ostream& os, const Tensor& tensor) { + typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar; + typedef typename Tensor::Index Index; + const Index total_size = internal::array_prod(tensor.dimensions()); + if (total_size > 0) { + const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions()); + static const int layout = Tensor::Layout; + Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim); + os << matrix; + } + } +}; + + +// Print the tensor as a vector +template <typename Tensor> +struct TensorPrinter<Tensor, 1> { + static void run (std::ostream& os, const Tensor& tensor) { + typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar; + typedef typename Tensor::Index Index; + const Index total_size = internal::array_prod(tensor.dimensions()); + if (total_size > 0) { + Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size); + os << array; + } + } +}; + + +// Print the tensor as a scalar +template <typename Tensor> +struct TensorPrinter<Tensor, 0> { + static void run (std::ostream& os, const Tensor& tensor) { + os << tensor.coeff(0); + } +}; +} template <typename T> std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) { + typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator; + typedef typename Evaluator::Dimensions Dimensions; + // Evaluate the expression if needed TensorForcedEvalOp<const T> eval = expr.eval(); - TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> tensor(eval, DefaultDevice()); + Evaluator tensor(eval, DefaultDevice()); tensor.evalSubExprsIfNeeded(NULL); - typedef typename internal::remove_const<typename T::Scalar>::type Scalar; - typedef typename T::Index Index; - typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions; - const Index total_size = internal::array_prod(tensor.dimensions()); - - // Print the tensor as a 1d vector or a 2d matrix. 
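The rank dispatch that this hunk removes from operator<< now lives in the internal::TensorPrinter specializations above: rank 0 prints the single coefficient, rank 1 prints a vector, and higher ranks are mapped onto a first_dim x (size / first_dim) matrix. A short sketch of the user-visible behaviour (the values are illustrative, not part of the patch):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<int, 2> t(2, 3);
      t.setValues({{1, 2, 3}, {4, 5, 6}});
      std::cout << t << std::endl;        // printed as a 2x3 matrix

      Eigen::Tensor<int, 0> s = t.sum();  // reducing over all dimensions yields a rank-0 tensor
      std::cout << s << std::endl;        // printed as the single value 21
      return 0;
    }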
+ // Print the result static const int rank = internal::array_size<Dimensions>::value; - if (rank == 0) { - os << tensor.coeff(0); - } else if (rank == 1) { - Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size); - os << array; - } else { - const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions()); - static const int layout = TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Layout; - Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim); - os << matrix; - } + internal::TensorPrinter<Evaluator, rank>::run(os, tensor); // Cleanup. tensor.cleanup(); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h index bafcc67bd..566856ed2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorImagePatch.h @@ -174,7 +174,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { - EIGEN_STATIC_ASSERT(NumDims >= 4, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims >= 4), YOU_MADE_A_PROGRAMMING_MISTAKE); m_paddingValue = op.padding_value(); @@ -362,7 +362,7 @@ struct TensorEvaluator<const TensorImagePatchOp<Rows, Cols, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h index 985594bc8..3209fecd3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIndexList.h @@ -10,7 +10,8 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H #define EIGEN_CXX11_TENSOR_TENSOR_INDEX_LIST_H -#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES) + +#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES #define EIGEN_HAS_INDEX_LIST @@ -45,6 +46,24 @@ struct type2index { } }; +// This can be used with IndexPairList to get compile-time constant pairs, +// such as IndexPairList<type2indexpair<1,2>, type2indexpair<3,4>>(). 
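A minimal sketch of that usage (illustrative only; it assumes a compiler for which EIGEN_HAS_INDEX_LIST is enabled, i.e. constexpr and variadic template support):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      // Both pairs are encoded in the type, so their values are compile-time constants.
      Eigen::IndexPairList<Eigen::type2indexpair<1, 2>, Eigen::type2indexpair<3, 4> > pairs;
      Eigen::IndexPair<Eigen::DenseIndex> p0 = pairs[0];  // (1, 2)
      Eigen::IndexPair<Eigen::DenseIndex> p1 = pairs[1];  // (3, 4)
      return (p0.first == 1 && p1.second == 4) ? 0 : 1;
    }

Such a list can be used wherever a fixed-size list of IndexPair values is expected, padding and contraction dimensions being the typical cases, and the index_pair_first_statically_eq / index_pair_second_statically_eq helpers added further down let evaluators query the pairs whose values are known statically.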
+template <DenseIndex f, DenseIndex s> +struct type2indexpair { + static const DenseIndex first = f; + static const DenseIndex second = s; + + constexpr EIGEN_DEVICE_FUNC operator IndexPair<DenseIndex>() const { + return IndexPair<DenseIndex>(f, s); + } + + EIGEN_DEVICE_FUNC void set(const IndexPair<DenseIndex>& val) { + eigen_assert(val.first == f); + eigen_assert(val.second == s); + } +}; + + template<DenseIndex n> struct NumTraits<type2index<n> > { typedef DenseIndex Real; @@ -73,6 +92,16 @@ EIGEN_DEVICE_FUNC void update_value(type2index<n>& val, DenseIndex new_val) { } template <typename T> +EIGEN_DEVICE_FUNC void update_value(T& val, IndexPair<DenseIndex> new_val) { + val = new_val; +} +template <DenseIndex f, DenseIndex s> +EIGEN_DEVICE_FUNC void update_value(type2indexpair<f, s>& val, IndexPair<DenseIndex> new_val) { + val.set(new_val); +} + + +template <typename T> struct is_compile_time_constant { static constexpr bool value = false; }; @@ -94,7 +123,22 @@ struct is_compile_time_constant<const type2index<idx>& > { static constexpr bool value = true; }; - +template <DenseIndex f, DenseIndex s> +struct is_compile_time_constant<type2indexpair<f, s> > { + static constexpr bool value = true; +}; +template <DenseIndex f, DenseIndex s> +struct is_compile_time_constant<const type2indexpair<f, s> > { + static constexpr bool value = true; +}; +template <DenseIndex f, DenseIndex s> +struct is_compile_time_constant<type2indexpair<f, s>& > { + static constexpr bool value = true; +}; +template <DenseIndex f, DenseIndex s> +struct is_compile_time_constant<const type2indexpair<f, s>& > { + static constexpr bool value = true; +}; template<typename... T> @@ -184,31 +228,32 @@ template <typename T, typename... O> -template <DenseIndex Idx> +template <DenseIndex Idx, typename ValueT> struct tuple_coeff { template <typename... T> - EIGEN_DEVICE_FUNC static constexpr DenseIndex get(const DenseIndex i, const IndexTuple<T...>& t) { - return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx); + EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex i, const IndexTuple<T...>& t) { + // return array_get<Idx>(t) * (i == Idx) + tuple_coeff<Idx-1>::get(i, t) * (i != Idx); + return (i == Idx ? array_get<Idx>(t) : tuple_coeff<Idx-1, ValueT>::get(i, t)); } template <typename... T> - EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const DenseIndex value) { + EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT& value) { if (i == Idx) { update_value(array_get<Idx>(t), value); } else { - tuple_coeff<Idx-1>::set(i, t, value); + tuple_coeff<Idx-1, ValueT>::set(i, t, value); } } template <typename... T> EIGEN_DEVICE_FUNC static constexpr bool value_known_statically(const DenseIndex i, const IndexTuple<T...>& t) { return ((i == Idx) & is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value) || - tuple_coeff<Idx-1>::value_known_statically(i, t); + tuple_coeff<Idx-1, ValueT>::value_known_statically(i, t); } template <typename... T> EIGEN_DEVICE_FUNC static constexpr bool values_up_to_known_statically(const IndexTuple<T...>& t) { return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value && - tuple_coeff<Idx-1>::values_up_to_known_statically(t); + tuple_coeff<Idx-1, ValueT>::values_up_to_known_statically(t); } template <typename... 
T> @@ -216,19 +261,19 @@ struct tuple_coeff { return is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value && is_compile_time_constant<typename IndexTupleExtractor<Idx, T...>::ValType>::value && array_get<Idx>(t) > array_get<Idx-1>(t) && - tuple_coeff<Idx-1>::values_up_to_statically_known_to_increase(t); + tuple_coeff<Idx-1, ValueT>::values_up_to_statically_known_to_increase(t); } }; -template <> -struct tuple_coeff<0> { +template <typename ValueT> +struct tuple_coeff<0, ValueT> { template <typename... T> - EIGEN_DEVICE_FUNC static constexpr DenseIndex get(const DenseIndex i, const IndexTuple<T...>& t) { + EIGEN_DEVICE_FUNC static constexpr ValueT get(const DenseIndex /*i*/, const IndexTuple<T...>& t) { // eigen_assert (i == 0); // gcc fails to compile assertions in constexpr - return array_get<0>(t) * (i == 0); + return array_get<0>(t)/* * (i == 0)*/; } template <typename... T> - EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const DenseIndex value) { + EIGEN_DEVICE_FUNC static void set(const DenseIndex i, IndexTuple<T...>& t, const ValueT value) { eigen_assert (i == 0); update_value(array_get<0>(t), value); } @@ -254,13 +299,13 @@ struct tuple_coeff<0> { template<typename FirstType, typename... OtherTypes> struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> { EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex operator[] (const DenseIndex i) const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::get(i, *this); + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr DenseIndex get(const DenseIndex i) const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::get(i, *this); + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::get(i, *this); } EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const DenseIndex value) { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::set(i, *this, value); + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::set(i, *this, value); } EIGEN_DEVICE_FUNC constexpr IndexList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { } @@ -268,14 +313,14 @@ struct IndexList : internal::IndexTuple<FirstType, OtherTypes...> { EIGEN_DEVICE_FUNC constexpr IndexList() : internal::IndexTuple<FirstType, OtherTypes...>() { } EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::value_known_statically(i, *this); + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this); } EIGEN_DEVICE_FUNC constexpr bool all_values_known_statically() const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::values_up_to_known_statically(*this); + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, 
DenseIndex>::values_up_to_known_statically(*this); } EIGEN_DEVICE_FUNC constexpr bool values_statically_known_to_increase() const { - return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1>::values_up_to_statically_known_to_increase(*this); + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::values_up_to_statically_known_to_increase(*this); } }; @@ -286,6 +331,23 @@ constexpr IndexList<FirstType, OtherTypes...> make_index_list(FirstType val1, Ot } +template<typename FirstType, typename... OtherTypes> +struct IndexPairList : internal::IndexTuple<FirstType, OtherTypes...> { + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC constexpr IndexPair<DenseIndex> operator[] (const DenseIndex i) const { + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, IndexPair<DenseIndex>>::get(i, *this); + } + EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC void set(const DenseIndex i, const IndexPair<DenseIndex> value) { + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...>>::value-1, IndexPair<DenseIndex> >::set(i, *this, value); + } + + EIGEN_DEVICE_FUNC constexpr IndexPairList(const internal::IndexTuple<FirstType, OtherTypes...>& other) : internal::IndexTuple<FirstType, OtherTypes...>(other) { } + EIGEN_DEVICE_FUNC constexpr IndexPairList() : internal::IndexTuple<FirstType, OtherTypes...>() { } + + EIGEN_DEVICE_FUNC constexpr bool value_known_statically(const DenseIndex i) const { + return internal::tuple_coeff<internal::array_size<internal::IndexTuple<FirstType, OtherTypes...> >::value-1, DenseIndex>::value_known_statically(i, *this); + } +}; + namespace internal { template<typename FirstType, typename... OtherTypes> size_t array_prod(const IndexList<FirstType, OtherTypes...>& sizes) { @@ -303,6 +365,13 @@ template<typename FirstType, typename... OtherTypes> struct array_size<const Ind static const size_t value = array_size<IndexTuple<FirstType, OtherTypes...> >::value; }; +template<typename FirstType, typename... OtherTypes> struct array_size<IndexPairList<FirstType, OtherTypes...> > { + static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value; +}; +template<typename FirstType, typename... OtherTypes> struct array_size<const IndexPairList<FirstType, OtherTypes...> > { + static const size_t value = std::tuple_size<std::tuple<FirstType, OtherTypes...> >::value; +}; + template<DenseIndex N, typename FirstType, typename... OtherTypes> EIGEN_DEVICE_FUNC constexpr DenseIndex array_get(IndexList<FirstType, OtherTypes...>& a) { return IndexTupleExtractor<N, FirstType, OtherTypes...>::get_val(a); } @@ -472,6 +541,57 @@ struct index_statically_lt_impl<const IndexList<FirstType, OtherTypes...> > { } }; + + +template <typename Tx> +struct index_pair_first_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_pair_first_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value); + } +}; + +template <typename FirstType, typename... 
OtherTypes> +struct index_pair_first_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexPairList<FirstType, OtherTypes...>().operator[](i).first == value); + } +}; + + + +template <typename Tx> +struct index_pair_second_statically_eq_impl { + EIGEN_DEVICE_FUNC static constexpr bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_pair_second_statically_eq_impl<IndexPairList<FirstType, OtherTypes...> > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value); + } +}; + +template <typename FirstType, typename... OtherTypes> +struct index_pair_second_statically_eq_impl<const IndexPairList<FirstType, OtherTypes...> > { + EIGEN_DEVICE_FUNC static constexpr bool run(const DenseIndex i, const DenseIndex value) { + return IndexPairList<FirstType, OtherTypes...>().value_known_statically(i) & + (IndexPairList<FirstType, OtherTypes...>().operator[](i).second == value); + } +}; + + } // end namespace internal } // end namespace Eigen @@ -482,53 +602,69 @@ namespace internal { template <typename T> struct index_known_statically_impl { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(const DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(const DenseIndex) { return false; } }; template <typename T> struct all_indices_known_statically_impl { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run() { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { return false; } }; template <typename T> struct indices_statically_known_to_increase_impl { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run() { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run() { return false; } }; template <typename T> struct index_statically_eq_impl { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { return false; } }; template <typename T> struct index_statically_ne_impl { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { return false; } }; template <typename T> struct index_statically_gt_impl { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { return false; } }; template <typename T> struct index_statically_lt_impl { - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static bool run(DenseIndex, DenseIndex) { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template <typename Tx> +struct index_pair_first_statically_eq_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { + return false; + } +}; + +template <typename Tx> +struct index_pair_second_statically_eq_impl { + static EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool run(DenseIndex, DenseIndex) { return false; } }; + + } // end namespace internal } // end namespace Eigen @@ -572,6 +708,16 @@ static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool 
index_statically_lt(DenseIndex i, return index_statically_lt_impl<T>::run(i, value); } +template <typename T> +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_first_statically_eq(DenseIndex i, DenseIndex value) { + return index_pair_first_statically_eq_impl<T>::run(i, value); +} + +template <typename T> +static EIGEN_DEVICE_FUNC EIGEN_CONSTEXPR bool index_pair_second_statically_eq(DenseIndex i, DenseIndex value) { + return index_pair_second_statically_eq_impl<T>::run(i, value); +} + } // end namespace internal } // end namespace Eigen diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h index de2f67d74..f391fb9ee 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInflation.h @@ -189,7 +189,7 @@ struct TensorEvaluator<const TensorInflationOp<Strides, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h index 2d223140e..33edc49e3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorInitializer.h @@ -10,7 +10,7 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H #define EIGEN_CXX11_TENSOR_TENSOR_INITIALIZER_H -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES #include <initializer_list> diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h index 33c6c1b0f..ede3939c2 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIntDiv.h @@ -29,25 +29,47 @@ namespace Eigen { namespace internal { namespace { + // Note: result is undefined if val == 0 template <typename T> - EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE int count_leading_zeros(const T val) + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + typename internal::enable_if<sizeof(T)==4,int>::type count_leading_zeros(const T val) { #ifdef __CUDA_ARCH__ - return (sizeof(T) == 8) ? __clzll(val) : __clz(val); + return __clz(val); #elif EIGEN_COMP_MSVC - unsigned long index; - if (sizeof(T) == 8) { - _BitScanReverse64(&index, val); - } else { - _BitScanReverse(&index, val); - } - return (sizeof(T) == 8) ? 63 - index : 31 - index; + unsigned long index; + _BitScanReverse(&index, val); + return 31 - index; +#else + EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); + return __builtin_clz(static_cast<uint32_t>(val)); +#endif + } + + template <typename T> + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE + typename internal::enable_if<sizeof(T)==8,int>::type count_leading_zeros(const T val) + { +#ifdef __CUDA_ARCH__ + return __clzll(val); +#elif EIGEN_COMP_MSVC && EIGEN_ARCH_x86_64 + unsigned long index; + _BitScanReverse64(&index, val); + return 63 - index; +#elif EIGEN_COMP_MSVC + // MSVC's _BitScanReverse64 is not available for 32bits builds. 
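  // (When the high 32 bits are all zero, the leading-zero count is 32 plus the count for the
  //  low word; otherwise only the high word contributes.)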
+ unsigned int lo = (unsigned int)(val&0xffffffff); + unsigned int hi = (unsigned int)((val>>32)&0xffffffff); + int n; + if(hi==0) + n = 32 + count_leading_zeros<unsigned int>(lo); + else + n = count_leading_zeros<unsigned int>(hi); + return n; #else EIGEN_STATIC_ASSERT(sizeof(unsigned long long) == 8, YOU_MADE_A_PROGRAMMING_MISTAKE); - return (sizeof(T) == 8) ? - __builtin_clzll(static_cast<uint64_t>(val)) : - __builtin_clz(static_cast<uint32_t>(val)); + return __builtin_clzll(static_cast<uint64_t>(val)); #endif } @@ -98,7 +120,9 @@ namespace { return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); #else const uint64_t shift = 1ULL << log_div; - TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1)); + TensorUInt128<uint64_t, uint64_t> result = TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) + - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + + TensorUInt128<static_val<0>, static_val<1> >(1); return static_cast<uint64_t>(result); #endif } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h index 8ed71f838..ee0078bbc 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMacros.h @@ -28,7 +28,7 @@ // SFINAE requires variadic templates #ifndef __CUDACC__ -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES // SFINAE doesn't work for gcc <= 4.7 #ifdef EIGEN_COMP_GNUC #if EIGEN_GNUC_AT_LEAST(4,8) @@ -44,7 +44,7 @@ typename internal::enable_if< ( __condition__ ) , int >::type = 0 -#if defined(EIGEN_HAS_CONSTEXPR) +#if EIGEN_HAS_CONSTEXPR #define EIGEN_CONSTEXPR constexpr #else #define EIGEN_CONSTEXPR diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h index 9ebd9172b..6fb4f4a31 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMap.h @@ -57,7 +57,7 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor EIGEN_STATIC_ASSERT((0 == NumIndices || NumIndices == Dynamic), YOU_MADE_A_PROGRAMMING_MISTAKE) } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorMap(PointerArgType dataPtr, Index firstDimension, IndexTypes... otherDimensions) : m_data(dataPtr), m_dimensions(firstDimension, otherDimensions...) { // The number of dimensions used to construct a tensor must be equal to the rank of the tensor. @@ -140,7 +140,7 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor return m_data[index]; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... otherIndices) const { @@ -227,7 +227,7 @@ template<typename PlainObjectType, int Options_> class TensorMap : public Tensor return m_data[index]; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar& operator()(Index firstIndex, Index secondIndex, IndexTypes... 
otherIndices) { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index cd04716bd..fdb5ee6b8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -47,22 +47,39 @@ template <> struct max_n_1<0> { // Default packet types template <typename Scalar, typename Device> -struct PacketType { +struct PacketType : internal::packet_traits<Scalar> { typedef typename internal::packet_traits<Scalar>::type type; - enum { size = internal::unpacket_traits<type>::size }; }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16) template <> -struct PacketType<float, GpuDevice> { - typedef float4 type; - static const int size = 4; -}; -template <> -struct PacketType<double, GpuDevice> { - typedef double2 type; +struct PacketType<half, GpuDevice> { + typedef half2 type; static const int size = 2; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasLog = 1, + HasLog1p = 0, + HasLog10 = 0, + HasPow = 1, + }; }; #endif @@ -112,6 +129,20 @@ bool operator!=(const Tuple<U, V>& x, const Tuple<U, V>& y) { } +// Can't use std::pairs on cuda devices +template <typename Idx> struct IndexPair { + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair() : first(0), second(0) {} + EIGEN_CONSTEXPR EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE IndexPair(Idx f, Idx s) : first(f), second(s) {} + + EIGEN_DEVICE_FUNC void set(IndexPair<Idx> val) { + first = val.first; + second = val.second; + } + + Idx first; + Idx second; +}; + #ifdef EIGEN_HAS_SFINAE namespace internal { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index bfa65a607..d34f1e328 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -148,7 +148,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); } - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } protected: TensorEvaluator<ArgType, Device> m_impl; @@ -409,7 +409,7 @@ struct TensorEvaluator<const TensorSlicingOp<StartIndices, Sizes, ArgType>, Devi EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { const int packetSize = internal::unpacket_traits<PacketReturnType>::size; - EIGEN_STATIC_ASSERT(packetSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((packetSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+packetSize-1 < internal::array_prod(dimensions())); Index inputIndices[] = {0, 0}; @@ -603,6 +603,286 @@ struct TensorEvaluator<TensorSlicingOp<StartIndices, Sizes, ArgType>, Device> }; + +namespace internal { +template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> +struct traits<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > : public traits<XprType> +{ + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename XprTraits::StorageKind 
StorageKind; + typedef typename XprTraits::Index Index; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = array_size<StartIndices>::value; + static const int Layout = XprTraits::Layout; +}; + +template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> +struct eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, Eigen::Dense> +{ + typedef const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>& type; +}; + +template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> +struct nested<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType>, 1, typename eval<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> >::type> +{ + typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> type; +}; + +} // end namespace internal + + +template<typename StartIndices, typename StopIndices, typename Strides, typename XprType> +class TensorStridingSlicingOp : public TensorBase<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, XprType> > +{ + public: + typedef typename internal::traits<TensorStridingSlicingOp>::Scalar Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename internal::nested<TensorStridingSlicingOp>::type Nested; + typedef typename internal::traits<TensorStridingSlicingOp>::StorageKind StorageKind; + typedef typename internal::traits<TensorStridingSlicingOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorStridingSlicingOp( + const XprType& expr, const StartIndices& startIndices, + const StopIndices& stopIndices, const Strides& strides) + : m_xpr(expr), m_startIndices(startIndices), m_stopIndices(stopIndices), + m_strides(strides) {} + + EIGEN_DEVICE_FUNC + const StartIndices& startIndices() const { return m_startIndices; } + EIGEN_DEVICE_FUNC + const StartIndices& stopIndices() const { return m_stopIndices; } + EIGEN_DEVICE_FUNC + const StartIndices& strides() const { return m_strides; } + + EIGEN_DEVICE_FUNC + const typename internal::remove_all<typename XprType::Nested>::type& + expression() const { return m_xpr; } + + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const TensorStridingSlicingOp& other) + { + typedef TensorAssignOp<TensorStridingSlicingOp, const TensorStridingSlicingOp> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + template<typename OtherDerived> + EIGEN_DEVICE_FUNC + EIGEN_STRONG_INLINE TensorStridingSlicingOp& operator = (const OtherDerived& other) + { + typedef TensorAssignOp<TensorStridingSlicingOp, const OtherDerived> Assign; + Assign assign(*this, other); + internal::TensorExecutor<const Assign, DefaultDevice>::run( + assign, DefaultDevice()); + return *this; + } + + protected: + typename XprType::Nested m_xpr; + const StartIndices m_startIndices; + const StopIndices m_stopIndices; + const Strides m_strides; +}; + +// Eval as rvalue +template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device> +struct TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> +{ + typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType; + static const int NumDims = internal::array_size<Strides>::value; + + enum { + // Alignment can't be guaranteed at 
compile time since it depends on the + // slice offsets and sizes. + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : m_impl(op.expression(), device), m_device(device), m_strides(op.strides()) + { + // Handle degenerate intervals by gracefully clamping and allowing m_dimensions to be zero + DSizes<Index,NumDims> startIndicesClamped, stopIndicesClamped; + for (size_t i = 0; i < internal::array_size<Dimensions>::value; ++i) { + eigen_assert(m_strides[i] != 0 && "0 stride is invalid"); + if(m_strides[i]>0){ + startIndicesClamped[i] = clamp(op.startIndices()[i], 0, m_impl.dimensions()[i]); + stopIndicesClamped[i] = clamp(op.stopIndices()[i], 0, m_impl.dimensions()[i]); + }else{ + /* implies m_strides[i]<0 by assert */ + startIndicesClamped[i] = clamp(op.startIndices()[i], -1, m_impl.dimensions()[i] - 1); + stopIndicesClamped[i] = clamp(op.stopIndices()[i], -1, m_impl.dimensions()[i] - 1); + } + m_startIndices[i] = startIndicesClamped[i]; + } + + const typename TensorEvaluator<ArgType, Device>::Dimensions& input_dims = m_impl.dimensions(); + + // check for degenerate intervals and compute output tensor shape + bool degenerate = false;; + for(int i = 0; i < NumDims; i++){ + Index interval = stopIndicesClamped[i] - startIndicesClamped[i]; + if(interval == 0 || ((interval<0) != (m_strides[i]<0))){ + m_dimensions[i] = 0; + degenerate = true; + }else{ + m_dimensions[i] = interval / m_strides[i] + + (interval % m_strides[i] != 0 ? 1 : 0); + eigen_assert(m_dimensions[i] >= 0); + } + } + Strides output_dims = m_dimensions; + + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + m_inputStrides[0] = m_strides[0]; + m_offsets[0] = startIndicesClamped[0]; + Index previousDimProduct = 1; + for (int i = 1; i < NumDims; ++i) { + previousDimProduct *= input_dims[i-1]; + m_inputStrides[i] = previousDimProduct * m_strides[i]; + m_offsets[i] = startIndicesClamped[i] * previousDimProduct; + } + + // Don't initialize m_fastOutputStrides[0] since it won't ever be accessed. + m_outputStrides[0] = 1; + for (int i = 1; i < NumDims; ++i) { + m_outputStrides[i] = m_outputStrides[i-1] * output_dims[i-1]; + // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 1 : m_outputStrides[i]); + } + } else { + m_inputStrides[NumDims-1] = m_strides[NumDims-1]; + m_offsets[NumDims-1] = startIndicesClamped[NumDims-1]; + Index previousDimProduct = 1; + for (int i = NumDims - 2; i >= 0; --i) { + previousDimProduct *= input_dims[i+1]; + m_inputStrides[i] = previousDimProduct * m_strides[i]; + m_offsets[i] = startIndicesClamped[i] * previousDimProduct; + } + + m_outputStrides[NumDims-1] = 1; + for (int i = NumDims - 2; i >= 0; --i) { + m_outputStrides[i] = m_outputStrides[i+1] * output_dims[i+1]; + // NOTE: if tensor is degenerate, we send 1 to prevent TensorIntDivisor constructor crash + m_fastOutputStrides[i] = internal::TensorIntDivisor<Index>(degenerate ? 
1 : m_outputStrides[i]); + } + } + m_block_total_size_max = numext::maxi(static_cast<std::size_t>(1), + device.lastLevelCacheSize() / + sizeof(Scalar)); + } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef Strides Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } + + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) { + m_impl.evalSubExprsIfNeeded(NULL); + return true; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + m_impl.cleanup(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_impl.coeff(srcCoeff(index)); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { + return m_impl.costPerCoeff(vectorized) + TensorOpCost(0, 0, NumDims); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Scalar* data() const { + return NULL; + } + + protected: + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index srcCoeff(Index index) const + { + Index inputIndex = 0; + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = NumDims - 1; i >= 0; --i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i] + m_offsets[i]; + index -= idx * m_outputStrides[i]; + } + } else { + for (int i = 0; i < NumDims; ++i) { + const Index idx = index / m_fastOutputStrides[i]; + inputIndex += idx * m_inputStrides[i] + m_offsets[i]; + index -= idx * m_outputStrides[i]; + } + } + return inputIndex; + } + + static EIGEN_STRONG_INLINE Index clamp(Index value, Index min, Index max) { + return numext::maxi(min, numext::mini(max,value)); + } + + array<Index, NumDims> m_outputStrides; + array<internal::TensorIntDivisor<Index>, NumDims> m_fastOutputStrides; + array<Index, NumDims> m_inputStrides; + TensorEvaluator<ArgType, Device> m_impl; + const Device& m_device; + DSizes<Index, NumDims> m_startIndices; // clamped startIndices + DSizes<Index, NumDims> m_dimensions; + DSizes<Index, NumDims> m_offsets; // offset in a flattened shape + const Strides m_strides; + std::size_t m_block_total_size_max; +}; + +// Eval as lvalue +template<typename StartIndices, typename StopIndices, typename Strides, typename ArgType, typename Device> +struct TensorEvaluator<TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> + : public TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> +{ + typedef TensorEvaluator<const TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType>, Device> Base; + typedef TensorStridingSlicingOp<StartIndices, StopIndices, Strides, ArgType> XprType; + static const int NumDims = internal::array_size<Strides>::value; + + enum { + IsAligned = false, + PacketAccess = false, + BlockAccess = false, + Layout = TensorEvaluator<ArgType, Device>::Layout, + CoordAccess = TensorEvaluator<ArgType, Device>::CoordAccess, + RawAccess = false + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) + : Base(op, device) + { } + + typedef typename XprType::Index Index; + typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<Scalar>::type ScalarNonConst; + typedef typename 
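// The constructor above clamps the start/stop indices (to [0, dim] for positive strides and
// [-1, dim-1] for negative ones) and derives each output dimension as interval / stride,
// rounded up; a dimension whose interval is empty, or whose sign disagrees with the stride,
// collapses to size 0 (a degenerate slice). Worked examples, assuming an input dimension of
// size 10 (illustrative only, not part of this patch):
//
//   start=1, stop=10, stride=3   ->  ceil((10-1)/3) = 3 elements: indices 1, 4, 7
//   start=8, stop=-2, stride=-2  ->  stop clamps to -1, interval = -9,
//                                    (-9)/(-2) + 1 = 5 elements: indices 8, 6, 4, 2, 0
//   start=3, stop=3,  stride=1   ->  empty interval, output size 0
//
// The expression is presumably exposed through a TensorBase method along the lines of
// stridedSlice(startIndices, stopIndices, strides); that declaration is not part of this hunk.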
XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef Strides Dimensions; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType& coeffRef(Index index) + { + return this->m_impl.coeffRef(this->srcCoeff(index)); + } +}; + + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_MORPHING_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h index 88b838b27..647bcf108 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPadding.h @@ -93,7 +93,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size; enum { - IsAligned = false, + IsAligned = true, PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = true, @@ -106,7 +106,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device // The padding op doesn't change the rank of the tensor. Directly padding a scalar would lead // to a vector, which doesn't make sense. Instead one should reshape the scalar into a vector // of 1 element first and then pad. - EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); // Compute dimensions m_dimensions = m_impl.dimensions(); @@ -150,27 +150,26 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { for (int i = NumDims - 1; i > 0; --i) { const Index idx = index / m_outputStrides[i]; - if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + if (isPaddingAtIndexForDim(idx, i)) { return m_paddingValue; } inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; index -= idx * m_outputStrides[i]; } - if (index < m_padding[0].first || index >= m_dimensions[0] - m_padding[0].second) { + if (isPaddingAtIndexForDim(index, 0)) { return m_paddingValue; } inputIndex += (index - m_padding[0].first); } else { for (int i = 0; i < NumDims - 1; ++i) { const Index idx = index / m_outputStrides[i+1]; - if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { + if (isPaddingAtIndexForDim(idx, i)) { return m_paddingValue; } inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; index -= idx * m_outputStrides[i+1]; } - if (index < m_padding[NumDims-1].first || - index >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { + if (isPaddingAtIndexForDim(index, NumDims-1)) { return m_paddingValue; } inputIndex += (index - m_padding[NumDims-1].first); @@ -187,43 +186,6 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device return packetRowMajor(index); } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(const array<Index, NumDims>& coords) const - { - Index inputIndex; - if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - { - const Index idx = coords[0]; - if (idx < m_padding[0].first || idx >= m_dimensions[0] - m_padding[0].second) { - return m_paddingValue; - } - inputIndex = idx - m_padding[0].first; - } - for (int i = 1; i < NumDims; ++i) { - const Index idx = coords[i]; - if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { - return m_paddingValue; - } - inputIndex += (idx - m_padding[i].first) * 
m_inputStrides[i]; - } - } else { - { - const Index idx = coords[NumDims-1]; - if (idx < m_padding[NumDims-1].first || idx >= m_dimensions[NumDims-1] - m_padding[NumDims-1].second) { - return m_paddingValue; - } - inputIndex = idx - m_padding[NumDims-1].first; - } - for (int i = NumDims - 2; i >= 0; --i) { - const Index idx = coords[i]; - if (idx < m_padding[i].first || idx >= m_dimensions[i] - m_padding[i].second) { - return m_paddingValue; - } - inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; - } - } - return m_impl.coeff(inputIndex); - } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool vectorized) const { TensorOpCost cost = m_impl.costPerCoeff(vectorized); if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { @@ -239,6 +201,40 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device EIGEN_DEVICE_FUNC Scalar* data() const { return NULL; } private: + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isPaddingAtIndexForDim( + Index index, int dim_index) const { +#if defined(EIGEN_HAS_INDEX_LIST) + return (!internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0) && + index < m_padding[dim_index].first) || + (!internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0) && + index >= m_dimensions[dim_index] - m_padding[dim_index].second); +#else + return (index < m_padding[dim_index].first) || + (index >= m_dimensions[dim_index] - m_padding[dim_index].second); +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isLeftPaddingCompileTimeZero( + int dim_index) const { +#if defined(EIGEN_HAS_INDEX_LIST) + return internal::index_pair_first_statically_eq<PaddingDimensions>(dim_index, 0); +#else + EIGEN_UNUSED_VARIABLE(dim_index); + return false; +#endif + } + + EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool isRightPaddingCompileTimeZero( + int dim_index) const { +#if defined(EIGEN_HAS_INDEX_LIST) + return internal::index_pair_second_statically_eq<PaddingDimensions>(dim_index, 0); +#else + EIGEN_UNUSED_VARIABLE(dim_index); + return false; +#endif + } + + void updateCostPerDimension(TensorOpCost& cost, int i, bool first) const { const double in = static_cast<double>(m_impl.dimensions()[i]); const double out = in + m_padding[i].first + m_padding[i].second; @@ -261,7 +257,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetColMajor(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index initialIndex = index; @@ -273,15 +269,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i]; const Index lastPaddedRight = m_outputStrides[i+1]; - if (last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. 
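// The packet paths below handle each dimension with a three-way test: if the whole packet
// falls inside the left or right padding region the padding value is broadcast with pset1, if
// it falls entirely inside the input a single unaligned packet load is used, and otherwise the
// packet is assembled coefficient by coefficient. The isLeft/RightPaddingCompileTimeZero
// helpers let the compiler drop the padding-region branches entirely when the padding amounts
// are encoded in the type (EIGEN_HAS_INDEX_LIST). A runtime-padding usage sketch, not part of
// this patch:
//
//   Eigen::Tensor<float, 2> input(3, 4);
//   input.setRandom();
//   Eigen::array<std::pair<int, int>, 2> paddings;
//   paddings[0] = std::make_pair(1, 2);   // pad 1 row before, 2 rows after
//   paddings[1] = std::make_pair(0, 0);   // no padding on the second dimension
//   Eigen::Tensor<float, 2> padded = input.pad(paddings);   // 6 x 4, zero-padded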
return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (first >= lastPaddedLeft && last < firstPaddedRight) { + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. const Index idx = index / m_outputStrides[i]; inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; @@ -299,15 +295,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device const Index firstPaddedRight = (m_dimensions[0] - m_padding[0].second); const Index lastPaddedRight = m_outputStrides[1]; - if (last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(0) && last < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(0) && first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (first >= lastPaddedLeft && last < firstPaddedRight) { + else if ((isLeftPaddingCompileTimeZero(0) && isRightPaddingCompileTimeZero(0)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. inputIndex += (index - m_padding[0].first); return m_impl.template packet<Unaligned>(inputIndex); @@ -318,7 +314,7 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packetRowMajor(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); const Index initialIndex = index; @@ -331,15 +327,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device const Index firstPaddedRight = (m_dimensions[i] - m_padding[i].second) * m_outputStrides[i+1]; const Index lastPaddedRight = m_outputStrides[i]; - if (last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(i) && last < lastPaddedLeft) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(i) && first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (first >= lastPaddedLeft && last < firstPaddedRight) { + else if ((isLeftPaddingCompileTimeZero(i) && isRightPaddingCompileTimeZero(i)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. const Index idx = index / m_outputStrides[i+1]; inputIndex += (idx - m_padding[i].first) * m_inputStrides[i]; @@ -357,15 +353,15 @@ struct TensorEvaluator<const TensorPaddingOp<PaddingDimensions, ArgType>, Device const Index firstPaddedRight = (m_dimensions[NumDims-1] - m_padding[NumDims-1].second); const Index lastPaddedRight = m_outputStrides[NumDims-1]; - if (last < lastPaddedLeft) { + if (!isLeftPaddingCompileTimeZero(NumDims-1) && last < lastPaddedLeft) { // all the coefficient are in the padding zone. 
return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (first >= firstPaddedRight && last < lastPaddedRight) { + else if (!isRightPaddingCompileTimeZero(NumDims-1) && first >= firstPaddedRight && last < lastPaddedRight) { // all the coefficient are in the padding zone. return internal::pset1<PacketReturnType>(m_paddingValue); } - else if (first >= lastPaddedLeft && last < firstPaddedRight) { + else if ((isLeftPaddingCompileTimeZero(NumDims-1) && isRightPaddingCompileTimeZero(NumDims-1)) || (first >= lastPaddedLeft && last < firstPaddedRight)) { // all the coefficient are between the 2 padding zones. inputIndex += (index - m_padding[NumDims-1].first); return m_impl.template packet<Unaligned>(inputIndex); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h index a87e45330..886a254f6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorPatch.h @@ -184,7 +184,7 @@ struct TensorEvaluator<const TensorPatchOp<PatchDim, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); Index output_stride_index = (static_cast<int>(Layout) == static_cast<int>(ColMajor)) ? NumDims - 1 : 0; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h new file mode 100644 index 000000000..1655a813e --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRandom.h @@ -0,0 +1,276 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H +#define EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H + +namespace Eigen { +namespace internal { + +namespace { + +EIGEN_DEVICE_FUNC uint64_t get_random_seed() { +#ifdef __CUDA_ARCH__ + // We don't support 3d kernels since we currently only use 1 and + // 2d kernels. + assert(threadIdx.z == 0); + return clock64() + + blockIdx.x * blockDim.x + threadIdx.x + + gridDim.x * blockDim.x * (blockIdx.y * blockDim.y + threadIdx.y); + +#elif defined _WIN32 + // Use the current time as a baseline. + SYSTEMTIME st; + GetSystemTime(&st); + int time = st.wSecond + 1000 * st.wMilliseconds; + // Mix in a random number to make sure that we get different seeds if + // we try to generate seeds faster than the clock resolution. + // We need 2 random values since the generator only generate 16 bits at + // a time (https://msdn.microsoft.com/en-us/library/398ax69y.aspx) + int rnd1 = ::rand(); + int rnd2 = ::rand(); + uint64_t rnd = (rnd1 | rnd2 << 16) ^ time; + return rnd; + +#elif defined __APPLE__ + // Same approach as for win32, except that the random number generator + // is better (// https://developer.apple.com/legacy/library/documentation/Darwin/Reference/ManPages/man3/random.3.html#//apple_ref/doc/man/3/random). 
+ uint64_t rnd = ::random() ^ mach_absolute_time(); + return rnd; + +#else + // Augment the current time with pseudo random number generation + // to ensure that we get different seeds if we try to generate seeds + // faster than the clock resolution. + timespec ts; + clock_gettime(CLOCK_REALTIME, &ts); + uint64_t rnd = ::random() ^ ts.tv_nsec; + return rnd; +#endif +} + +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned PCG_XSH_RS_generator(uint64_t* state) { + // TODO: Unify with the implementation in the non blocking thread pool. + uint64_t current = *state; + // Update the internal state + *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; + // Generate the random output (using the PCG-XSH-RS scheme) + return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61))); +} + +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE uint64_t PCG_XSH_RS_state(uint64_t seed) { + seed = seed ? seed : get_random_seed(); + return seed * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; +} + +} // namespace + + +template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +T RandomToTypeUniform(uint64_t* state) { + unsigned rnd = PCG_XSH_RS_generator(state); + return static_cast<T>(rnd); +} + + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +Eigen::half RandomToTypeUniform<Eigen::half>(uint64_t* state) { + Eigen::half result; + // Generate 10 random bits for the mantissa + unsigned rnd = PCG_XSH_RS_generator(state); + result.x = static_cast<uint16_t>(rnd & 0x3ffu); + // Set the exponent + result.x |= (static_cast<uint16_t>(15) << 10); + // Return the final result + return result - Eigen::half(1.0f); +} + + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float RandomToTypeUniform<float>(uint64_t* state) { + typedef union { + uint32_t raw; + float fp; + } internal; + internal result; + // Generate 23 random bits for the mantissa mantissa + const unsigned rnd = PCG_XSH_RS_generator(state); + result.raw = rnd & 0x7fffffu; + // Set the exponent + result.raw |= (static_cast<uint32_t>(127) << 23); + // Return the final result + return result.fp - 1.0f; +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double RandomToTypeUniform<double>(uint64_t* state) { + typedef union { + uint64_t raw; + double dp; + } internal; + internal result; + result.raw = 0; + // Generate 52 random bits for the mantissa + // First generate the upper 20 bits + unsigned rnd1 = PCG_XSH_RS_generator(state) & 0xfffffu; + // The generate the lower 32 bits + unsigned rnd2 = PCG_XSH_RS_generator(state); + result.raw = (static_cast<uint64_t>(rnd1) << 32) | rnd2; + // Set the exponent + result.raw |= (static_cast<uint64_t>(1023) << 52); + // Return the final result + return result.dp - 1.0; +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex<float> RandomToTypeUniform<std::complex<float> >(uint64_t* state) { + return std::complex<float>(RandomToTypeUniform<float>(state), + RandomToTypeUniform<float>(state)); +} +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex<double> RandomToTypeUniform<std::complex<double> >(uint64_t* state) { + return std::complex<double>(RandomToTypeUniform<double>(state), + RandomToTypeUniform<double>(state)); +} + +template <typename T> class UniformRandomGenerator { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. 
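// The uniform generators above build a floating point value in [1, 2) by filling the mantissa
// with random bits and forcing the exponent field to a biased zero (127 for float, 1023 for
// double), then subtract 1 to land in [0, 1). A self-contained sketch of the float case, with
// the PCG output standing in as the random word (illustrative only, not part of this patch):
//
//   #include <cstdint>
//   inline float bits_to_uniform01(uint32_t rnd) {
//     union { uint32_t raw; float fp; } u;
//     u.raw = (rnd & 0x7fffffu) | (127u << 23);  // 23 random mantissa bits, exponent 127 -> [1, 2)
//     return u.fp - 1.0f;                        // shift to [0, 1)
//   }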
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( + uint64_t seed = 0) { + m_state = PCG_XSH_RS_state(seed); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE UniformRandomGenerator( + const UniformRandomGenerator& other) { + m_state = other.m_state; + } + + template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T operator()(Index i) const { + uint64_t local_state = m_state + i; + T result = RandomToTypeUniform<T>(&local_state); + m_state = local_state; + return result; + } + + template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(Index i) const { + const int packetSize = internal::unpacket_traits<Packet>::size; + EIGEN_ALIGN_MAX T values[packetSize]; + uint64_t local_state = m_state + i; + for (int j = 0; j < packetSize; ++j) { + values[j] = RandomToTypeUniform<T>(&local_state); + } + m_state = local_state; + return internal::pload<Packet>(values); + } + + private: + mutable uint64_t m_state; +}; + +template <typename Scalar> +struct functor_traits<UniformRandomGenerator<Scalar> > { + enum { + // Rough estimate for floating point, multiplied by ceil(sizeof(T) / sizeof(float)). + Cost = 12 * NumTraits<Scalar>::AddCost * + ((sizeof(Scalar) + sizeof(float) - 1) / sizeof(float)), + PacketAccess = UniformRandomGenerator<Scalar>::PacketAccess + }; +}; + + + +template <typename T> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +T RandomToTypeNormal(uint64_t* state) { + // Use the ratio of uniform method to generate numbers following a normal + // distribution. See for example Numerical Recipes chapter 7.3.9 for the + // details. + T u, v, q; + do { + u = RandomToTypeUniform<T>(state); + v = T(1.7156) * (RandomToTypeUniform<T>(state) - T(0.5)); + const T x = u - T(0.449871); + const T y = numext::abs(v) + T(0.386595); + q = x*x + y * (T(0.196)*y - T(0.25472)*x); + } while (q > T(0.27597) && + (q > T(0.27846) || v*v > T(-4) * numext::log(u) * u*u)); + + return v/u; +} + +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex<float> RandomToTypeNormal<std::complex<float> >(uint64_t* state) { + return std::complex<float>(RandomToTypeNormal<float>(state), + RandomToTypeNormal<float>(state)); +} +template <> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +std::complex<double> RandomToTypeNormal<std::complex<double> >(uint64_t* state) { + return std::complex<double>(RandomToTypeNormal<double>(state), + RandomToTypeNormal<double>(state)); +} + + +template <typename T> class NormalRandomGenerator { + public: + static const bool PacketAccess = true; + + // Uses the given "seed" if non-zero, otherwise uses a random seed. 
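// RandomToTypeNormal above uses the ratio-of-uniforms method (Numerical Recipes 7.3.9): draw
// u in (0, 1) and v in a scaled symmetric interval, accept when v*v <= -4*u*u*log(u) (the q
// tests are cheap pre-checks for that condition), and return v/u, which is then N(0, 1)
// distributed. Both generators are meant to be plugged into tensor expressions; a usage
// sketch, assuming the templated setRandom<Generator>() overload documented in the Tensor
// README (illustrative only, not part of this patch):
//
//   Eigen::Tensor<float, 2> t(64, 64);
//   t.setRandom<Eigen::internal::UniformRandomGenerator<float> >();  // uniform in [0, 1)
//   t.setRandom<Eigen::internal::NormalRandomGenerator<float> >();   // standard normal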
+ EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator(uint64_t seed = 0) { + m_state = PCG_XSH_RS_state(seed); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE NormalRandomGenerator( + const NormalRandomGenerator& other) { + m_state = other.m_state; + } + + template<typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + T operator()(Index i) const { + uint64_t local_state = m_state + i; + T result = RandomToTypeNormal<T>(&local_state); + m_state = local_state; + return result; + } + + template<typename Packet, typename Index> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + Packet packetOp(Index i) const { + const int packetSize = internal::unpacket_traits<Packet>::size; + EIGEN_ALIGN_MAX T values[packetSize]; + uint64_t local_state = m_state + i; + for (int j = 0; j < packetSize; ++j) { + values[j] = RandomToTypeNormal<T>(&local_state); + } + m_state = local_state; + return internal::pload<Packet>(values); + } + + private: + mutable uint64_t m_state; +}; + + +template <typename Scalar> +struct functor_traits<NormalRandomGenerator<Scalar> > { + enum { + // On average, we need to generate about 3 random numbers + // 15 mul, 8 add, 1.5 logs + Cost = 3 * functor_traits<UniformRandomGenerator<Scalar> >::Cost + + 15 * NumTraits<Scalar>::AddCost + 8 * NumTraits<Scalar>::AddCost + + 3 * functor_traits<scalar_log_op<Scalar> >::Cost / 2, + PacketAccess = NormalRandomGenerator<Scalar>::PacketAccess + }; +}; + + +} // end namespace internal +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_RANDOM_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 885295f0a..a87777b22 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -87,7 +87,7 @@ struct preserve_inner_most_dims { static const bool value = false; }; -#if defined(EIGEN_HAS_CONSTEXPR) && defined(EIGEN_HAS_VARIADIC_TEMPLATES) +#if EIGEN_HAS_CONSTEXPR && EIGEN_HAS_VARIADIC_TEMPLATES template <typename ReducedDims, int NumTensorDims> struct are_inner_most_dims<ReducedDims, NumTensorDims, ColMajor>{ static const bool tmp1 = indices_statically_known_to_increase<ReducedDims>(); @@ -122,7 +122,7 @@ struct preserve_inner_most_dims<ReducedDims, NumTensorDims, RowMajor>{ template <int DimIndex, typename Self, typename Op> struct GenericDimReducer { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::CoeffReturnType* accum) { - EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); for (int j = 0; j < self.m_reducedDims[DimIndex]; ++j) { const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; GenericDimReducer<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum); @@ -183,7 +183,7 @@ struct InnerMostDimPreserver { template <int DimIndex, typename Self, typename Op> struct InnerMostDimPreserver<DimIndex, Self, Op, true> { static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const Self& self, typename Self::Index firstIndex, Op& reducer, typename Self::PacketReturnType* accum) { - EIGEN_STATIC_ASSERT(DimIndex > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((DimIndex > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); for (typename Self::Index j = 0; j < self.m_reducedDims[DimIndex]; ++j) { const typename Self::Index input = firstIndex + j * self.m_reducedStrides[DimIndex]; 
InnerMostDimPreserver<DimIndex-1, Self, Op>::reduce(self, input, reducer, accum); @@ -248,16 +248,12 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> { *output = reducer.finalize(reducer.initialize()); return; } -#ifdef EIGEN_USE_COST_MODEL const TensorOpCost cost = self.m_impl.costPerCoeff(Vectorizable) + TensorOpCost(0, 0, internal::functor_traits<Op>::Cost, Vectorizable, PacketSize); const int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads( num_coeffs, cost, device.numThreads()); -#else - const int num_threads = device.numThreads(); -#endif if (num_threads == 1) { *output = InnerMostDimReducer<Self, Op, Vectorizable>::reduce(self, 0, num_coeffs, reducer); @@ -268,7 +264,7 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> { const Index numblocks = blocksize > 0 ? num_coeffs / blocksize : 0; eigen_assert(num_coeffs >= numblocks * blocksize); - Barrier barrier(numblocks); + Barrier barrier(internal::convert_index<unsigned int>(numblocks)); MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize()); for (Index i = 0; i < numblocks; ++i) { device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run, @@ -320,7 +316,18 @@ struct OuterReducer { #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template <int B, int N, typename S, typename R, typename I> -__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); +__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); + + +#ifdef EIGEN_HAS_CUDA_FP16 +template <typename S, typename R, typename I> +__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); +template <int B, int N, typename S, typename R, typename I> +__global__ void FullReductionKernelHalfFloat(R, const S, I, half*, half2*); +template <int NPT, typename S, typename R, typename I> +__global__ void InnerReductionKernelHalfFloat(R, const S, I, I, half*); + +#endif template <int NPT, typename S, typename R, typename I> __global__ void InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); @@ -396,7 +403,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device), m_reducer(op.reducer()), m_result(NULL), m_device(device) { - EIGEN_STATIC_ASSERT(NumInputDims >= NumReducedDims, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumInputDims >= NumReducedDims), YOU_MADE_A_PROGRAMMING_MISTAKE); EIGEN_STATIC_ASSERT((!ReducingInnerMostDims | !PreservingInnerMostDims | (NumReducedDims == NumInputDims)), YOU_MADE_A_PROGRAMMING_MISTAKE); @@ -464,22 +471,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { return m_dimensions; } - static bool size_large_enough(Index total_size) { -#ifndef EIGEN_USE_COST_MODEL - return total_size > 1024 * 1024; -#else - return true || total_size; -#endif - } - EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC bool evalSubExprsIfNeeded(CoeffReturnType* data) { m_impl.evalSubExprsIfNeeded(NULL); // Use the FullReducer if possible. 
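// With the EIGEN_USE_COST_MODEL guard removed, the thread-pool full reducer always sizes its
// work through TensorCostModel: the per-coefficient cost of the input expression plus the cost
// of the reduction functor decide how many threads to use, instead of unconditionally taking
// device.numThreads(). The matching size_large_enough() heuristic (only take the optimized
// path for inputs larger than 1024*1024 coefficients) is dropped from the evaluator below, so
// on the CPU the FullReducer is used whenever the reducer provides an optimized
// implementation, and on CUDA additionally when the device has compute capability 3.0 or newer.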
- if (RunningFullReduction && internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation && + if (RunningFullReduction && + internal::FullReducer<Self, Op, Device>::HasOptimizedImplementation && ((RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) || - (!RunningOnGPU && size_large_enough(internal::array_prod(m_impl.dimensions()))))) { - + !RunningOnGPU)) { bool need_assign = false; if (!data) { m_result = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType))); @@ -493,7 +492,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> } // Attempt to use an optimized reduction. - else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { + else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { @@ -506,8 +505,25 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> (reducing_inner_dims || ReducingInnerMostDims)) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) { + data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } + else { + return true; + } + } Op reducer(m_reducer); - return internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } } bool preserving_inner_dims = true; @@ -522,8 +538,25 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> preserving_inner_dims) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data) { + if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) { + data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } + else { + return true; + } + } Op reducer(m_reducer); - return internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } } } return true; @@ -533,13 +566,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> m_impl.cleanup(); if (m_result) { m_device.deallocate(m_result); + m_result = NULL; } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - if (RunningFullReduction && m_result) { - return *m_result; + if ((RunningFullReduction || RunningOnGPU) && m_result) { + return *(m_result + index); } Op reducer(m_reducer); if (ReducingInnerMostDims || RunningFullReduction) { @@ -558,8 +592,12 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, 
Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) + eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions()))); + + if (RunningOnGPU && m_result) { + return internal::pload<PacketReturnType>(m_result + index); + } EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; if (ReducingInnerMostDims) { @@ -617,11 +655,19 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> template <typename S, typename O, bool V> friend struct internal::FullReducerShard; #endif #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); + template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); +#ifdef EIGEN_HAS_CUDA_FP16 + template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); + template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); + template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernelHalfFloat(R, const S, I, I, half*); +#endif template <int NPT, typename S, typename R, typename I> friend void internal::InnerReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); + template <int NPT, typename S, typename R, typename I> friend void internal::OuterReductionKernel(R, const S, I, I, typename S::CoeffReturnType*); #endif + template <typename S, typename O, typename D> friend struct internal::InnerReducer; + // Returns the Index in the input tensor of the first value that needs to be // used to compute the reduction at output index "index". 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Index firstInput(Index index) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index fd2587dd5..65638b6a8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -67,8 +67,41 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) #endif } -template <typename T> -__device__ inline void atomicReduce(T* output, T accum, SumReducer<T>&) { +// We extend atomicExch to support extra data types +template <typename Type> +__device__ inline Type atomicExchCustom(Type* address, Type val) { + return atomicExch(address, val); +} + +template <> +__device__ inline double atomicExchCustom(double* address, double val) { + unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address); + return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); +} + +#ifdef EIGEN_HAS_CUDA_FP16 +template <template <typename T> class R> +__device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) { + unsigned int oldval = *reinterpret_cast<unsigned int*>(output); + unsigned int newval = oldval; + reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval)); + if (newval == oldval) { + return; + } + unsigned int readback; + while ((readback = atomicCAS((unsigned int*)output, oldval, newval)) != oldval) { + oldval = readback; + newval = oldval; + reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval)); + if (newval == oldval) { + return; + } + } +} +#endif + +template <> +__device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) { #if __CUDA_ARCH__ >= 300 atomicAdd(output, accum); #else @@ -86,17 +119,44 @@ __global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coe } } + template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index> __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, - typename Self::CoeffReturnType* output) { + typename Self::CoeffReturnType* output, unsigned int* semaphore) { +#if __CUDA_ARCH__ >= 300 + // Initialize the output value const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; - - // Initialize the output value if it wasn't initialized by the ReductionInitKernel - if (gridDim.x == 1 && first_index == 0) { - *output = reducer.initialize(); + if (gridDim.x == 1) { + if (first_index == 0) { + *output = reducer.initialize(); + } + } + else { + if (threadIdx.x == 0) { + unsigned int block = atomicCAS(semaphore, 0u, 1u); + if (block == 0) { + // We're the first block to run, initialize the output value + atomicExchCustom(output, reducer.initialize()); + __threadfence(); + atomicExch(semaphore, 2u); + } + else { + // Wait for the first block to initialize the output value. 
+ // Use atomicCAS here to ensure that the reads aren't cached + unsigned int val; + do { + val = atomicCAS(semaphore, 2u, 2u); + } + while (val < 2u); + } + } } + __syncthreads(); + + eigen_assert(gridDim.x == 1 || *semaphore >= 2u); + typename Self::CoeffReturnType accum = reducer.initialize(); Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize); for (Index i = 0; i < max_iter; i+=BlockSize) { @@ -108,50 +168,203 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num #pragma unroll for (int offset = warpSize/2; offset > 0; offset /= 2) { - reducer.reduce(__shfl_down(accum, offset), &accum); + reducer.reduce(__shfl_down(accum, offset, warpSize), &accum); } if ((threadIdx.x & (warpSize - 1)) == 0) { atomicReduce(output, accum, reducer); } + + if (gridDim.x > 1 && threadIdx.x == 0) { + // Let the last block reset the semaphore + atomicInc(semaphore, gridDim.x + 1); + } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + + +#ifdef EIGEN_HAS_CUDA_FP16 +template <typename Self, + typename Reducer, typename Index> +__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half2* scratch) { + eigen_assert(blockDim.x == 1); + eigen_assert(gridDim.x == 1); + if (num_coeffs % 2 != 0) { + half last = input.m_impl.coeff(num_coeffs-1); + *scratch = __halves2half2(last, reducer.initialize()); + } else { + *scratch = reducer.template initializePacket<half2>(); + } } +template <typename Self, + typename Reducer, typename Index> +__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) { + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + const Index num_threads = blockDim.x * gridDim.x; + const Index num_packets = num_coeffs / 2; + for (Index i = thread_id; i < num_packets; i += num_threads) { + ((half2*)output)[i] = reducer.template initializePacket<half2>(); + } -template <typename Self, typename Op, bool Vectorizable> -struct FullReducer<Self, Op, GpuDevice, Vectorizable> { - // Unfortunately nvidia doesn't support well exotic types such as complex, - // so reduce the scope of the optimized version of the code to the simple case - // of floats. 
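// Block-ordering protocol of FullReductionKernel above, used when several blocks are launched:
// thread 0 of the first block to reach the semaphore CASes it from 0 to 1, stores the initial
// value into *output with atomicExchCustom, issues a __threadfence() and then sets the
// semaphore to 2; thread 0 of every other block spins on atomicCAS(semaphore, 2u, 2u) until it
// observes 2, and the whole block then proceeds past the __syncthreads(). After the warp-level
// reductions, thread 0 of each block calls atomicInc(semaphore, gridDim.x + 1), which wraps
// the counter back to 0 once the last block finishes, leaving it ready for the next launch.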
- static const bool HasOptimizedImplementation = !Op::IsStateful && - internal::is_same<typename Self::CoeffReturnType, float>::value; + if (thread_id == 0 && num_coeffs % 2 != 0) { + output[num_coeffs-1] = reducer.initialize(); + } +} - template <typename OutputType> - static EIGEN_DEVICE_FUNC void run(const Self&, Op&, const GpuDevice&, OutputType*) { - assert(false && "Should only be called on floats"); +template <int BlockSize, int NumPerThread, typename Self, + typename Reducer, typename Index> +__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, + half* output, half2* scratch) { + eigen_assert(NumPerThread % 2 == 0); + + const Index first_index = blockIdx.x * BlockSize * NumPerThread + 2*threadIdx.x; + + // Initialize the output value if it wasn't initialized by the ReductionInitKernel + if (gridDim.x == 1 && first_index == 0) { + if (num_coeffs % 2 != 0) { + half last = input.m_impl.coeff(num_coeffs-1); + *scratch = __halves2half2(last, reducer.initialize()); + } else { + *scratch = reducer.template initializePacket<half2>(); + } + __syncthreads(); + } + + half2 accum = reducer.template initializePacket<half2>(); + const Index max_iter = numext::mini<Index>((num_coeffs - first_index) / 2, NumPerThread*BlockSize / 2); + for (Index i = 0; i < max_iter; i += BlockSize) { + const Index index = first_index + 2*i; + eigen_assert(index + 1 < num_coeffs); + half2 val = input.m_impl.template packet<Unaligned>(index); + reducer.reducePacket(val, &accum); + } + +#pragma unroll + for (int offset = warpSize/2; offset > 0; offset /= 2) { + reducer.reducePacket(__shfl_down(accum, offset, warpSize), &accum); } - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output) { + if ((threadIdx.x & (warpSize - 1)) == 0) { + atomicReduce(scratch, accum, reducer); + } + + __syncthreads(); + + if (gridDim.x == 1 && first_index == 0) { + half tmp = __low2half(*scratch); + reducer.reduce(__high2half(*scratch), &tmp); + *output = tmp; + } +} + +template <typename Op> +__global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2* scratch) { + eigen_assert(threadIdx.x == 1); + half tmp = __low2half(*scratch); + reducer.reduce(__high2half(*scratch), &tmp); + *output = tmp; +} + +#endif + +template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> +struct FullReductionLauncher { + static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) { + assert(false && "Should only be called on doubles, floats and half floats"); + } +}; + +// Specialization for float and double +template <typename Self, typename Op, typename OutputType, bool PacketAccess> +struct FullReductionLauncher< + Self, Op, OutputType, PacketAccess, + typename internal::enable_if< + internal::is_same<float, OutputType>::value || + internal::is_same<double, OutputType>::value, + void>::type> { + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) { typedef typename Self::Index Index; + typedef typename Self::CoeffReturnType Scalar; + const int block_size = 256; + const int num_per_thread = 128; + const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); - const Index num_coeffs = array_prod(self.m_impl.dimensions()); - // Don't crash when we're called with an input tensor of size 0. 
- if (num_coeffs == 0) { - return; + unsigned int* semaphore = NULL; + if (num_blocks > 1) { + semaphore = device.semaphore(); } + LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), + num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore); + } +}; + +#ifdef EIGEN_HAS_CUDA_FP16 +template <typename Self, typename Op> +struct FullReductionLauncher<Self, Op, Eigen::half, false> { + static void run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index) { + assert(false && "Should not be called since there is no packet accessor"); + } +}; + +template <typename Self, typename Op> +struct FullReductionLauncher<Self, Op, Eigen::half, true> { + static void run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs) { + typedef typename Self::Index Index; + const int block_size = 256; const int num_per_thread = 128; const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); + half2* scratch = static_cast<half2*>(device.scratchpad()); if (num_blocks > 1) { - // We initialize the outputs outside the reduction kernel when we can't be sure that there + // We initialize the output and the scrathpad outside the reduction kernel when we can't be sure that there // won't be a race conditions between multiple thread blocks. - LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>), - 1, 32, 0, device, reducer.initialize(), 1, output); + LAUNCH_CUDA_KERNEL((ReductionInitFullReduxKernelHalfFloat<Self, Op, Index>), + 1, 1, 0, device, reducer, self, num_coeffs, scratch); } - LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), - num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); + LAUNCH_CUDA_KERNEL((FullReductionKernelHalfFloat<block_size, num_per_thread, Self, Op, Index>), + num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, scratch); + + if (num_blocks > 1) { + LAUNCH_CUDA_KERNEL((ReductionCleanupKernelHalfFloat<Op>), + 1, 1, 0, device, reducer, output, scratch); + } + } +}; +#endif + + +template <typename Self, typename Op, bool Vectorizable> +struct FullReducer<Self, Op, GpuDevice, Vectorizable> { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple cases + // of doubles, floats and half floats +#ifdef EIGEN_HAS_CUDA_FP16 + static const bool HasOptimizedImplementation = !Op::IsStateful && + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value || + (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); +#else + static const bool HasOptimizedImplementation = !Op::IsStateful && + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value); +#endif + + template <typename OutputType> + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { + assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. 
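// FullReducer<GpuDevice>::run is now a thin dispatcher: after the size-0 early return it hands
// the work to FullReductionLauncher<Self, Op, OutputType, PacketAccess>. The float/double
// specialization launches FullReductionKernel, fetching device.semaphore() only when more than
// one block is needed; with EIGEN_HAS_CUDA_FP16 and a reducer that exposes half2 packet
// access, the half specialization instead runs the half-float pipeline, wrapping
// FullReductionKernelHalfFloat with a scratch-init kernel and a cleanup kernel (which folds
// the two half lanes of the half2 scratch value into the final half result) when several
// blocks are used.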
+ if (num_coeffs == 0) { + return; + } + + FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs); } }; @@ -160,6 +373,8 @@ template <int NumPerThread, typename Self, typename Reducer, typename Index> __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, typename Self::CoeffReturnType* output) { +#if __CUDA_ARCH__ >= 300 + typedef typename Self::CoeffReturnType Type; eigen_assert(blockDim.y == 1); eigen_assert(blockDim.z == 1); eigen_assert(gridDim.y == 1); @@ -179,6 +394,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { output[i] = reducer.initialize(); } + __syncthreads(); } for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { @@ -188,13 +404,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu const Index col_block = i % input_col_blocks; const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x; - float reduced_val = reducer.initialize(); + Type reduced_val = reducer.initialize(); for (Index j = 0; j < NumPerThread; j += unroll_times) { const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1); if (last_col >= num_coeffs_to_reduce) { - for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col +=blockDim.x) { - const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) { + const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); reducer.reduce(val, &reduced_val); } break; @@ -217,33 +433,128 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu atomicReduce(&(output[row]), reduced_val, reducer); } } + } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif +} + +#ifdef EIGEN_HAS_CUDA_FP16 + +template <int NumPerThread, typename Self, + typename Reducer, typename Index> +__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, + half* output) { + eigen_assert(blockDim.y == 1); + eigen_assert(blockDim.z == 1); + eigen_assert(gridDim.y == 1); + eigen_assert(gridDim.z == 1); + + const int unroll_times = 16; + eigen_assert(NumPerThread % unroll_times == 0); + eigen_assert(unroll_times % 2 == 0); + + const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread * 2); + const Index num_input_blocks = divup<Index>(input_col_blocks * num_preserved_coeffs, 2); + const Index num_threads = blockDim.x * gridDim.x; + const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; + + // Initialize the output values if they weren't initialized by the ReductionInitKernel + if (gridDim.x == 1) { + Index i = 2*thread_id; + for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) { + half* loc = output + i; + *((half2*)loc) = reducer.template initializePacket<half2>(); + } + if (i < num_preserved_coeffs) { + output[i] = reducer.initialize(); + } __syncthreads(); } + + for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) { + const Index row = 2 * (i / input_col_blocks); + + if (row + 1 < num_preserved_coeffs) { + const Index col_block = i % input_col_blocks; + const Index col_begin = 2 * (col_block * blockDim.x * NumPerThread + threadIdx.x); + + half2 reduced_val1 
= reducer.template initializePacket<half2>(); + half2 reduced_val2 = reducer.template initializePacket<half2>(); + + for (Index j = 0; j < NumPerThread; j += unroll_times) { + const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1) * 2; + if (last_col >= num_coeffs_to_reduce) { + Index col = col_begin + blockDim.x * j; + for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) { + const half2 val1 = input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col); + reducer.reducePacket(val1, &reduced_val1); + const half2 val2 = input.m_impl.template packet<Unaligned>((row+1) * num_coeffs_to_reduce + col); + reducer.reducePacket(val2, &reduced_val2); + } + if (col < num_coeffs_to_reduce) { + // Peel; + const half last1 = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + const half2 val1 = __halves2half2(last1, reducer.initialize()); + reducer.reducePacket(val1, &reduced_val1); + const half last2 = input.m_impl.coeff((row+1) * num_coeffs_to_reduce + col); + const half2 val2 = __halves2half2(last2, reducer.initialize()); + reducer.reducePacket(val2, &reduced_val2); + } + break; + } else { + // Faster version of the loop with no branches after unrolling. +#pragma unroll + for (int k = 0; k < unroll_times; ++k) { + const Index col = col_begin + blockDim.x * (j + k) * 2; + reducer.reducePacket(input.m_impl.template packet<Unaligned>(row * num_coeffs_to_reduce + col), &reduced_val1); + reducer.reducePacket(input.m_impl.template packet<Unaligned>((row + 1)* num_coeffs_to_reduce + col), &reduced_val2); + } + } + } + +#pragma unroll + for (int offset = warpSize/2; offset > 0; offset /= 2) { + reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1); + reducer.reducePacket(__shfl_down(reduced_val2, offset, warpSize), &reduced_val2); + } + + half val1 = __low2half(reduced_val1); + reducer.reduce(__high2half(reduced_val1), &val1); + half val2 = __low2half(reduced_val2); + reducer.reduce(__high2half(reduced_val2), &val2); + half2 val = __halves2half2(val1, val2); + + if ((threadIdx.x & (warpSize - 1)) == 0) { + half* loc = output + row; + atomicReduce((half2*)loc, val, reducer); + } + } + } } -template <typename Self, typename Op> -struct InnerReducer<Self, Op, GpuDevice> { - // Unfortunately nvidia doesn't support well exotic types such as complex, - // so reduce the scope of the optimized version of the code to the simple case - // of floats. 
- static const bool HasOptimizedImplementation = !Op::IsStateful && - internal::is_same<typename Self::CoeffReturnType, float>::value; +#endif - template <typename Device, typename OutputType> - static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { - assert(false && "Should only be called to reduce floats on a gpu device"); +template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> +struct InnerReductionLauncher { + static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) { + assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device"); return true; } +}; - static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { +// Specialization for float and double +template <typename Self, typename Op, typename OutputType, bool PacketAccess> +struct InnerReductionLauncher< + Self, Op, OutputType, PacketAccess, + typename internal::enable_if< + internal::is_same<float, OutputType>::value || + internal::is_same<double, OutputType>::value, + void>::type> { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; - // It's faster to use the usual code. - if (num_coeffs_to_reduce <= 32) { - return true; - } - const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 128; @@ -259,7 +570,7 @@ struct InnerReducer<Self, Op, GpuDevice> { const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / 1024; const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); - LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>), + LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>), num_blocks, 1024, 0, device, reducer.initialize(), num_preserved_vals, output); } @@ -271,6 +582,85 @@ struct InnerReducer<Self, Op, GpuDevice> { } }; +#ifdef EIGEN_HAS_CUDA_FP16 +template <typename Self, typename Op> +struct InnerReductionLauncher<Self, Op, Eigen::half, false> { + static bool run(const Self&, Op&, const GpuDevice&, half*, typename Self::Index, typename Self::Index) { + assert(false && "Should not be called since there is no packet accessor"); + return true; + } +}; + +template <typename Self, typename Op> +struct InnerReductionLauncher<Self, Op, Eigen::half, true> { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, half* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + typedef typename Self::Index Index; + + if (num_preserved_vals % 2 != 0) { + // Not supported yet, revert to the slower code path + return true; + } + + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const int block_size = /*256*/128; + const int num_per_thread = /*128*/64; + const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / block_size; + const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); + + if (num_blocks > 1) { + // We initialize the outputs outside the reduction kernel when we can't be sure that there + // 
won't be a race condition between multiple thread blocks. + const int dyn_blocks = divup<int>(num_preserved_vals, 1024); + const int max_blocks = device.getNumCudaMultiProcessors() * + device.maxCudaThreadsPerMultiProcessor() / 1024; + const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); + LAUNCH_CUDA_KERNEL((ReductionInitKernelHalfFloat<Self, Op, Index>), + 1, 1, 0, device, reducer, self, num_preserved_vals, output); + } + + LAUNCH_CUDA_KERNEL((InnerReductionKernelHalfFloat<num_per_thread, Self, Op, Index>), + num_blocks, block_size, 0, device, reducer, self, num_coeffs_to_reduce, num_preserved_vals, output); + + return false; + } +}; +#endif + + +template <typename Self, typename Op> +struct InnerReducer<Self, Op, GpuDevice> { + // Unfortunately nvidia doesn't support well exotic types such as complex, + // so reduce the scope of the optimized version of the code to the simple case + // of doubles, floats and half floats. +#ifdef EIGEN_HAS_CUDA_FP16 + static const bool HasOptimizedImplementation = !Op::IsStateful && + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value || + (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); +#else + static const bool HasOptimizedImplementation = !Op::IsStateful && + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value); +#endif + + template <typename OutputType> + static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { + assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); + const Index num_coeffs = array_prod(self.m_impl.dimensions()); + // Don't crash when we're called with an input tensor of size 0. + if (num_coeffs == 0) { + return true; + } + // It's faster to use the usual code. + if (num_coeffs_to_reduce <= 128) { + return true; + } + + return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); + } +}; template <int NumPerThread, typename Self, typename Reducer, typename Index> @@ -283,6 +673,7 @@ __global__ void OuterReductionKernel(Reducer reducer, const Self input, Index nu for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { output[i] = reducer.initialize(); } + __syncthreads(); } // Do the reduction. @@ -307,11 +698,11 @@ struct OuterReducer<Self, Op, GpuDevice> { // so reduce the scope of the optimized version of the code to the simple case // of floats.
static const bool HasOptimizedImplementation = !Op::IsStateful && - internal::is_same<typename Self::CoeffReturnType, float>::value; - + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value); template <typename Device, typename OutputType> static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { - assert(false && "Should only be called to reduce floats on a gpu device"); + assert(false && "Should only be called to reduce doubles or floats on a gpu device"); return true; } @@ -323,7 +714,7 @@ struct OuterReducer<Self, Op, GpuDevice> { return true; } - const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; + const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; const int block_size = 256; const int num_per_thread = 16; const int dyn_blocks = divup<int>(num_coeffs, block_size * num_per_thread); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h index bc92d9e6d..99245f778 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorRef.h @@ -193,7 +193,7 @@ template<typename PlainObjectType> class TensorRef : public TensorBase<TensorRef return m_evaluator->coeff(index); } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template<typename... IndexTypes> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator()(Index firstIndex, IndexTypes... otherIndices) const { diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h index 1a59cc8f7..14e392e36 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReverse.h @@ -122,7 +122,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device : m_impl(op.expression(), device), m_reverse(op.reverse()) { // Reversing a scalar isn't supported yet. It would be a no-op anyway. - EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); // Compute strides m_dimensions = m_impl.dimensions(); @@ -195,7 +195,7 @@ struct TensorEvaluator<const TensorReverseOp<ReverseDimensions, ArgType>, Device EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); // TODO(ndjaitly): write a better packing routine that uses @@ -269,7 +269,7 @@ struct TensorEvaluator<TensorReverseOp<ReverseDimensions, ArgType>, Device> template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); // This code is pilfered from TensorMorphing.h diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h new file mode 100644 index 000000000..8501466ce --- /dev/null +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -0,0 +1,287 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. 
+// +// Copyright (C) 2016 Igor Babuschkin <igor@babuschk.in> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H +#define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H + +namespace Eigen { + +namespace internal { + +template <typename Op, typename XprType> +struct traits<TensorScanOp<Op, XprType> > + : public traits<XprType> { + typedef typename XprType::Scalar Scalar; + typedef traits<XprType> XprTraits; + typedef typename XprTraits::StorageKind StorageKind; + typedef typename XprType::Nested Nested; + typedef typename remove_reference<Nested>::type _Nested; + static const int NumDimensions = XprTraits::NumDimensions; + static const int Layout = XprTraits::Layout; +}; + +template<typename Op, typename XprType> +struct eval<TensorScanOp<Op, XprType>, Eigen::Dense> +{ + typedef const TensorScanOp<Op, XprType>& type; +}; + +template<typename Op, typename XprType> +struct nested<TensorScanOp<Op, XprType>, 1, + typename eval<TensorScanOp<Op, XprType> >::type> +{ + typedef TensorScanOp<Op, XprType> type; +}; +} // end namespace internal + +/** \class TensorScan + * \ingroup CXX11_Tensor_Module + * + * \brief Tensor scan class. + */ +template <typename Op, typename XprType> +class TensorScanOp + : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> { +public: + typedef typename Eigen::internal::traits<TensorScanOp>::Scalar Scalar; + typedef typename Eigen::NumTraits<Scalar>::Real RealScalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename Eigen::internal::nested<TensorScanOp>::type Nested; + typedef typename Eigen::internal::traits<TensorScanOp>::StorageKind StorageKind; + typedef typename Eigen::internal::traits<TensorScanOp>::Index Index; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp( + const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op()) + : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Index axis() const { return m_axis; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const XprType& expression() const { return m_expr; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + const Op accumulator() const { return m_accumulator; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + bool exclusive() const { return m_exclusive; } + +protected: + typename XprType::Nested m_expr; + const Index m_axis; + const Op m_accumulator; + const bool m_exclusive; +}; + +template <typename Self, typename Reducer, typename Device> +struct ScanLauncher; + +// Eval as rvalue +template <typename Op, typename ArgType, typename Device> +struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { + + typedef TensorScanOp<Op, ArgType> XprType; + typedef typename XprType::Index Index; + static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; + typedef DSizes<Index, NumDims> Dimensions; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; + typedef typename XprType::CoeffReturnType CoeffReturnType; + typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self; + + enum { + IsAligned = false, + PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1), + BlockAccess = false, + Layout = TensorEvaluator<ArgType, 
Device>::Layout, + CoordAccess = false, + RawAccess = true + }; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, + const Device& device) + : m_impl(op.expression(), device), + m_device(device), + m_exclusive(op.exclusive()), + m_accumulator(op.accumulator()), + m_size(m_impl.dimensions()[op.axis()]), + m_stride(1), + m_output(NULL) { + + // Accumulating a scalar isn't supported. + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(op.axis() >= 0 && op.axis() < NumDims); + + // Compute stride of scan axis + const Dimensions& dims = m_impl.dimensions(); + if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { + for (int i = 0; i < op.axis(); ++i) { + m_stride = m_stride * dims[i]; + } + } else { + for (int i = NumDims - 1; i > op.axis(); --i) { + m_stride = m_stride * dims[i]; + } + } + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const { + return m_stride; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const { + return m_size; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const { + return m_accumulator; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { + return m_exclusive; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const { + return m_impl; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { + return m_device; + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + m_impl.evalSubExprsIfNeeded(NULL); + ScanLauncher<Self, Op, Device> launcher; + if (data) { + launcher(*this, data); + return false; + } + + const Index total_size = internal::array_prod(dimensions()); + m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar))); + launcher(*this, m_output); + return true; + } + + template<int LoadMode> + EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { + return internal::ploadt<PacketReturnType, LoadMode>(m_output + index); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const + { + return m_output; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const + { + return m_output[index]; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { + if (m_output != NULL) { + m_device.deallocate(m_output); + m_output = NULL; + } + m_impl.cleanup(); + } + +protected: + TensorEvaluator<ArgType, Device> m_impl; + const Device& m_device; + const bool m_exclusive; + Op m_accumulator; + const Index m_size; + Index m_stride; + CoeffReturnType* m_output; +}; + +// CPU implementation of scan +// TODO(ibab) This single-threaded implementation should be parallelized, +// at least by running multiple scans at the same time. +template <typename Self, typename Reducer, typename Device> +struct ScanLauncher { + void operator()(Self& self, typename Self::CoeffReturnType *data) { + Index total_size = internal::array_prod(self.dimensions()); + + // We fix the index along the scan axis to 0 and perform a + // scan per remaining entry. The iteration is split into two nested + // loops to avoid an integer division by keeping track of each idx1 and idx2. 
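+    // For example, a ColMajor tensor with dimensions (3, 4) scanned along axis 1
+    // has self.stride() == 3 and self.size() == 4: idx1 only takes the value 0,
+    // idx2 selects one of the 3 rows, and curr = offset + idx3 * self.stride()
+    // visits the 4 entries of that row in order.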
+ for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) { + for (Index idx2 = 0; idx2 < self.stride(); idx2++) { + // Calculate the starting offset for the scan + Index offset = idx1 + idx2; + + // Compute the scan along the axis, starting at the calculated offset + typename Self::CoeffReturnType accum = self.accumulator().initialize(); + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + Index curr = offset + idx3 * self.stride(); + + if (self.exclusive()) { + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } else { + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); + } + } + } + } + } +}; + +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +// GPU implementation of scan +// TODO(ibab) This placeholder implementation performs multiple scans in +// parallel, but it would be better to use a parallel scan algorithm and +// optimize memory access. +template <typename Self, typename Reducer> +__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) { + // Compute offset as in the CPU version + Index val = threadIdx.x + blockIdx.x * blockDim.x; + Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride(); + + if (offset + (self.size() - 1) * self.stride() < total_size) { + // Compute the scan along the axis, starting at the calculated offset + typename Self::CoeffReturnType accum = self.accumulator().initialize(); + for (Index idx = 0; idx < self.size(); idx++) { + Index curr = offset + idx * self.stride(); + if (self.exclusive()) { + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } else { + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); + } + } + } + __syncthreads(); + +} + +template <typename Self, typename Reducer> +struct ScanLauncher<Self, Reducer, GpuDevice> { + void operator()(const Self& self, typename Self::CoeffReturnType* data) { + Index total_size = internal::array_prod(self.dimensions()); + Index num_blocks = (total_size / self.size() + 63) / 64; + Index block_size = 64; + LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data); + } +}; +#endif // EIGEN_USE_GPU && __CUDACC__ + +} // end namespace Eigen + +#endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h index e76533710..113c060e3 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorShuffling.h @@ -166,7 +166,7 @@ struct TensorEvaluator<const TensorShufflingOp<Shuffle, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; @@ -248,7 +248,7 @@ struct TensorEvaluator<TensorShufflingOp<Shuffle, ArgType>, Device> template <int StoreMode> EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + 
EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; internal::pstore<CoeffReturnType, PacketReturnType>(values, x); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h index 0e89033c4..f8121d17b 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStorage.h @@ -85,7 +85,7 @@ class TensorStorage<T, DSizes<IndexType, NumIndices_>, Options_> : m_data(internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(size)), m_dimensions(dimensions) { EIGEN_INTERNAL_TENSOR_STORAGE_CTOR_PLUGIN } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES template <typename... DenseIndex> EIGEN_DEVICE_FUNC TensorStorage(DenseIndex... indices) : m_dimensions(indices...) { m_data = internal::conditional_aligned_new_auto<T,(Options_&DontAlign)==0>(internal::array_prod(m_dimensions)); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h index 52b7d216a..6c35bfdb6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorStriding.h @@ -164,7 +164,7 @@ struct TensorEvaluator<const TensorStridingOp<Strides, ArgType>, Device> template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); Index inputIndices[] = {0, 0}; @@ -289,7 +289,7 @@ struct TensorEvaluator<TensorStridingOp<Strides, ArgType>, Device> template <int StoreMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void writePacket(Index index, const PacketReturnType& x) { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < this->dimensions().TotalSize()); Index inputIndices[] = {0, 0}; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index 5950f38e2..3523e7c94 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -20,6 +20,7 @@ struct static_val { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { } + template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) { eigen_assert(v == n); @@ -53,7 +54,7 @@ struct TensorUInt128 template<typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE explicit TensorUInt128(const T& x) : high(0), low(x) { - eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= static_cast<typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type>(NumTraits<LOW>::highest()))); + eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= NumTraits<uint64_t>::highest())); eigen_assert(x >= 0); } @@ -74,21 +75,21 @@ struct TensorUInt128 template <typename HL, typename LL, typename HR, typename LR> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -static bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) +bool operator == (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, 
LR>& rhs) { return (lhs.high == rhs.high) & (lhs.low == rhs.low); } template <typename HL, typename LL, typename HR, typename LR> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -static bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) +bool operator != (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) { return (lhs.high != rhs.high) | (lhs.low != rhs.low); } template <typename HL, typename LL, typename HR, typename LR> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -static bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) +bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) { if (lhs.high != rhs.high) { return lhs.high > rhs.high; @@ -98,7 +99,7 @@ static bool operator >= (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<H template <typename HL, typename LL, typename HR, typename LR> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -static bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) +bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) { if (lhs.high != rhs.high) { return lhs.high < rhs.high; @@ -108,7 +109,7 @@ static bool operator < (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR template <typename HL, typename LL, typename HR, typename LR> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -static TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) +TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) { TensorUInt128<uint64_t, uint64_t> result(lhs.high + rhs.high, lhs.low + rhs.low); if (result.low < rhs.low) { @@ -119,7 +120,7 @@ static TensorUInt128<uint64_t, uint64_t> operator + (const TensorUInt128<HL, LL> template <typename HL, typename LL, typename HR, typename LR> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -static TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) +TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) { TensorUInt128<uint64_t, uint64_t> result(lhs.high - rhs.high, lhs.low - rhs.low); if (result.low > lhs.low) { @@ -130,8 +131,8 @@ static TensorUInt128<uint64_t, uint64_t> operator - (const TensorUInt128<HL, LL> template <typename HL, typename LL, typename HR, typename LR> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -static TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) { // Split each 128-bit integer into 4 32-bit integers, and then do the // multiplications by hand as follow: @@ -205,8 +206,8 @@ static TensorUInt128<uint64_t, uint64_t> operator * (const TensorUInt128<HL, LL> } template <typename HL, typename LL, typename HR, typename LR> -EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE -static TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) +static EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +TensorUInt128<uint64_t, uint64_t> operator / (const TensorUInt128<HL, LL>& lhs, const TensorUInt128<HR, LR>& rhs) { if (rhs == TensorUInt128<static_val<0>, static_val<1> >(1)) { return TensorUInt128<uint64_t, uint64_t>(lhs.high, lhs.low); diff --git 
a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h index e735fc76f..0ca2cac84 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorVolumePatch.h @@ -187,7 +187,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorEvaluator(const XprType& op, const Device& device) : m_impl(op.expression(), device) { - EIGEN_STATIC_ASSERT(NumDims >= 5, YOU_MADE_A_PROGRAMMING_MISTAKE); + EIGEN_STATIC_ASSERT((NumDims >= 5), YOU_MADE_A_PROGRAMMING_MISTAKE); m_paddingValue = op.padding_value(); @@ -408,7 +408,7 @@ struct TensorEvaluator<const TensorVolumePatchOp<Planes, Rows, Cols, ArgType>, D template<int LoadMode> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { - EIGEN_STATIC_ASSERT(PacketSize > 1, YOU_MADE_A_PROGRAMMING_MISTAKE) + EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) eigen_assert(index+PacketSize-1 < dimensions().TotalSize()); if (m_in_row_strides != 1 || m_in_col_strides != 1 || m_row_inflate_strides != 1 || m_col_inflate_strides != 1 || diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt deleted file mode 100644 index 6e871a8da..000000000 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -FILE(GLOB Eigen_CXX11_TensorSymmetry_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_TensorSymmetry_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry COMPONENT Devel - ) - -add_subdirectory(util) diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt deleted file mode 100644 index dc9fc78ec..000000000 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_CXX11_TensorSymmetry_util_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_TensorSymmetry_util_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry/util COMPONENT Devel - ) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt deleted file mode 100644 index 88fef50c6..000000000 --- a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_CXX11_ThreadPool_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_ThreadPool_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/ThreadPool COMPONENT Devel - ) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h index 6dd64f185..71d55552d 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h @@ -50,7 +50,7 @@ class EventCount { public: class Waiter; - EventCount(std::vector<Waiter>& waiters) : waiters_(waiters) { + EventCount(MaxSizeVector<Waiter>& waiters) : waiters_(waiters) { eigen_assert(waiters.size() < (1 << kWaiterBits) - 1); // Initialize epoch to something close to overflow to test overflow. state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2); @@ -169,7 +169,8 @@ class EventCount { class Waiter { friend class EventCount; - std::atomic<Waiter*> next; + // Align to 128 byte boundary to prevent false sharing with other Waiter objects in the same vector. 
+ EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<Waiter*> next; std::mutex mu; std::condition_variable cv; uint64_t epoch; @@ -179,8 +180,6 @@ class EventCount { kWaiting, kSignaled, }; - // Prevent false sharing with other Waiter objects in the same vector. - char pad_[128]; }; private: @@ -200,7 +199,7 @@ class EventCount { static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift; static const uint64_t kEpochInc = 1ull << kEpochShift; std::atomic<uint64_t> state_; - std::vector<Waiter>& waiters_; + MaxSizeVector<Waiter>& waiters_; void Park(Waiter* w) { std::unique_lock<std::mutex> lock(w->mu); diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index 1c471a19f..354bce52a 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -23,18 +23,44 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { : env_(env), threads_(num_threads), queues_(num_threads), + coprimes_(num_threads), waiters_(num_threads), - blocked_(), - spinning_(), - done_(), + blocked_(0), + spinning_(0), + done_(false), ec_(waiters_) { - for (int i = 0; i < num_threads; i++) queues_.push_back(new Queue()); - for (int i = 0; i < num_threads; i++) + waiters_.resize(num_threads); + + // Calculate coprimes of num_threads. + // Coprimes are used for a random walk over all threads in Steal + // and NonEmptyQueueIndex. Iteration is based on the fact that if we take + // a walk starting at thread index t and calculate num_threads - 1 subsequent + // indices as (t + coprime) % num_threads, we will cover all threads without + // repetitions (effectively getting a pseudo-random permutation of thread + // indices). + for (int i = 1; i <= num_threads; i++) { + unsigned a = i; + unsigned b = num_threads; + // If GCD(a, b) == 1, then a and b are coprimes. + while (b != 0) { + unsigned tmp = a; + a = b; + b = tmp % b; + } + if (a == 1) { + coprimes_.push_back(i); + } + } + for (int i = 0; i < num_threads; i++) { + queues_.push_back(new Queue()); + } + for (int i = 0; i < num_threads; i++) { threads_.push_back(env_.CreateThread([this, i]() { WorkerLoop(i); })); + } } ~NonBlockingThreadPoolTempl() { - done_.store(true, std::memory_order_relaxed); + done_ = true; // Now if all threads block without work, they will start exiting. // But note that threads can continue to work arbitrarily long, // block, submit new work, unblock and otherwise live full life. @@ -50,7 +76,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { PerThread* pt = GetPerThread(); if (pt->pool == this) { // Worker thread of this pool, push onto the thread's queue. - Queue* q = queues_[pt->index]; + Queue* q = queues_[pt->thread_id]; t = q->PushFront(std::move(t)); } else { // A free-standing thread (or worker of another pool), push onto a random @@ -71,108 +97,111 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { env_.ExecuteTask(t); // Push failed, execute directly.
} + int NumThreads() const final { + return static_cast<int>(threads_.size()); + } + + int CurrentThreadId() const final { + const PerThread* pt = + const_cast<NonBlockingThreadPoolTempl*>(this)->GetPerThread(); + if (pt->pool == this) { + return pt->thread_id; + } else { + return -1; + } + } + private: typedef typename Environment::EnvThread Thread; struct PerThread { - bool inited; + constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) { } NonBlockingThreadPoolTempl* pool; // Parent pool, or null for normal threads. - unsigned index; // Worker thread index in pool. - unsigned rand; // Random generator state. + uint64_t rand; // Random generator state. + int thread_id; // Worker thread index in pool. }; Environment env_; MaxSizeVector<Thread*> threads_; MaxSizeVector<Queue*> queues_; - std::vector<EventCount::Waiter> waiters_; + MaxSizeVector<unsigned> coprimes_; + MaxSizeVector<EventCount::Waiter> waiters_; std::atomic<unsigned> blocked_; std::atomic<bool> spinning_; std::atomic<bool> done_; EventCount ec_; // Main worker thread loop. - void WorkerLoop(unsigned index) { + void WorkerLoop(int thread_id) { PerThread* pt = GetPerThread(); pt->pool = this; - pt->index = index; - Queue* q = queues_[index]; - EventCount::Waiter* waiter = &waiters_[index]; - std::vector<Task> stolen; + pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id()); + pt->thread_id = thread_id; + Queue* q = queues_[thread_id]; + EventCount::Waiter* waiter = &waiters_[thread_id]; for (;;) { - Task t; - if (!stolen.empty()) { - t = std::move(stolen.back()); - stolen.pop_back(); - } - if (!t.f) t = q->PopFront(); + Task t = q->PopFront(); if (!t.f) { - if (Steal(&stolen)) { - t = std::move(stolen.back()); - stolen.pop_back(); - while (stolen.size()) { - Task t1 = q->PushFront(std::move(stolen.back())); - stolen.pop_back(); - if (t1.f) { - // There is not much we can do in this case. Just execute the - // remaining directly. - stolen.push_back(std::move(t1)); - break; + t = Steal(); + if (!t.f) { + // Leave one thread spinning. This reduces latency. + // TODO(dvyukov): 1000 iterations is based on fair dice roll, tune it. + // Also, the time it takes to attempt to steal work 1000 times depends + // on the size of the thread pool. However, the speed at which the user + // of the thread pool submits tasks is independent of the size of the + // pool. Consider a time-based limit instead. + if (!spinning_ && !spinning_.exchange(true)) { + for (int i = 0; i < 1000 && !t.f; i++) { + t = Steal(); + } + spinning_ = false; + } + if (!t.f) { + if (!WaitForWork(waiter, &t)) { + return; } } } } if (t.f) { env_.ExecuteTask(t); - continue; } - // Leave one thread spinning. This reduces latency. - if (!spinning_ && !spinning_.exchange(true)) { - bool nowork = true; - for (int i = 0; i < 1000; i++) { - if (!OutOfWork()) { - nowork = false; - break; - } - } - spinning_ = false; - if (!nowork) continue; - } - if (!WaitForWork(waiter)) return; } } // Steal tries to steal work from other worker threads in best-effort manner.
- bool Steal(std::vector<Task>* stolen) { - if (queues_.size() == 1) return false; + Task Steal() { PerThread* pt = GetPerThread(); - unsigned lastq = pt->index; - for (unsigned i = queues_.size(); i > 0; i--) { - unsigned victim = Rand(&pt->rand) % queues_.size(); - if (victim == lastq && queues_.size() > 2) { - i++; - continue; + const size_t size = queues_.size(); + unsigned r = Rand(&pt->rand); + unsigned inc = coprimes_[r % coprimes_.size()]; + unsigned victim = r % size; + for (unsigned i = 0; i < size; i++) { + Task t = queues_[victim]->PopBack(); + if (t.f) { + return t; + } + victim += inc; + if (victim >= size) { + victim -= size; } - // Steal half of elements from a victim queue. - // It is typical to steal just one element, but that assumes that work is - // recursively subdivided in halves so that the stolen element is exactly - // half of work. If work elements are equally-sized, then is makes sense - // to steal half of elements at once and then work locally for a while. - if (queues_[victim]->PopBackHalf(stolen)) return true; - lastq = victim; } - // Just to make sure that we did not miss anything. - for (unsigned i = queues_.size(); i > 0; i--) - if (queues_[i - 1]->PopBackHalf(stolen)) return true; - return false; + return Task(); } - // WaitForWork blocks until new work is available, or if it is time to exit. - bool WaitForWork(EventCount::Waiter* waiter) { - // We already did best-effort emptiness check in Steal, so prepare blocking. + // WaitForWork blocks until new work is available (returns true), or if it is + // time to exit (returns false). Can optionally return a task to execute in t + // (in such case t.f != nullptr on return). + bool WaitForWork(EventCount::Waiter* waiter, Task* t) { + eigen_assert(!t->f); + // We already did best-effort emptiness check in Steal, so prepare for + // blocking. ec_.Prewait(waiter); - // Now do reliable emptiness check. - if (!OutOfWork()) { + // Now do a reliable emptiness check. + int victim = NonEmptyQueueIndex(); + if (victim != -1) { ec_.CancelWait(waiter); + *t = queues_[victim]->PopBack(); return true; } // Number of blocked threads is used as termination condition. @@ -186,7 +215,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { // right after incrementing blocked_ above. Now a free-standing thread // submits work and calls destructor (which sets done_). If we don't // re-check queues, we will exit leaving the work unexecuted. - if (!OutOfWork()) { + if (NonEmptyQueueIndex() != -1) { // Note: we must not pop from queues before we decrement blocked_, // otherwise the following scenario is possible. Consider that instead // of checking for emptiness we popped the only element from queues. 
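The coprime trick used by Steal() above (and by NonEmptyQueueIndex() below) guarantees that the random walk probes every queue exactly once: because the increment is coprime to the number of queues, the sequence victim, victim + inc, victim + 2*inc, ... taken modulo the queue count is a permutation of all queue indices. The following standalone sketch illustrates that walk; the names, the pool size of 6 and the chosen starting point are purely illustrative and not part of the patch:

    #include <cassert>
    #include <cstdio>
    #include <vector>

    // Collect all increments in [1, size] that are coprime to `size`,
    // mirroring the GCD loop in the pool constructor above.
    static std::vector<unsigned> ComputeCoprimes(unsigned size) {
      std::vector<unsigned> coprimes;
      for (unsigned i = 1; i <= size; i++) {
        unsigned a = i, b = size;
        while (b != 0) { unsigned tmp = a; a = b; b = tmp % b; }
        if (a == 1) coprimes.push_back(i);
      }
      return coprimes;
    }

    int main() {
      const unsigned size = 6;  // hypothetical number of worker queues
      const std::vector<unsigned> coprimes = ComputeCoprimes(size);
      // In the pool both of these come from the per-thread random generator.
      const unsigned inc = coprimes[1 % coprimes.size()];
      unsigned victim = 3 % size;
      std::vector<bool> visited(size, false);
      for (unsigned i = 0; i < size; i++) {
        assert(!visited[victim]);  // every queue is probed exactly once
        visited[victim] = true;
        std::printf("probe queue %u\n", victim);
        victim += inc;
        if (victim >= size) victim -= size;  // cheap wrap-around, as in Steal()
      }
      return 0;
    }

Any coprime increment works; the pool simply picks one pseudo-randomly on each probe so that different threads fan out over different victims.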
@@ -205,23 +234,36 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { return true; } - bool OutOfWork() { - for (unsigned i = 0; i < queues_.size(); i++) - if (!queues_[i]->Empty()) return false; - return true; + int NonEmptyQueueIndex() { + PerThread* pt = GetPerThread(); + const size_t size = queues_.size(); + unsigned r = Rand(&pt->rand); + unsigned inc = coprimes_[r % coprimes_.size()]; + unsigned victim = r % size; + for (unsigned i = 0; i < size; i++) { + if (!queues_[victim]->Empty()) { + return victim; + } + victim += inc; + if (victim >= size) { + victim -= size; + } + } + return -1; } - PerThread* GetPerThread() { + static EIGEN_STRONG_INLINE PerThread* GetPerThread() { EIGEN_THREAD_LOCAL PerThread per_thread_; PerThread* pt = &per_thread_; - if (pt->inited) return pt; - pt->inited = true; - pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id()); return pt; } - static unsigned Rand(unsigned* state) { - return *state = *state * 1103515245 + 12345; + static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) { + uint64_t current = *state; + // Update the internal state + *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; + // Generate the random output (using the PCG-XSH-RS scheme) + return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61))); } }; diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h index 0544a6e15..05ed76cbe 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/RunQueue.h @@ -38,7 +38,7 @@ namespace Eigen { template <typename Work, unsigned kSize> class RunQueue { public: - RunQueue() : front_(), back_() { + RunQueue() : front_(0), back_(0) { // require power-of-two for fast masking eigen_assert((kSize & (kSize - 1)) == 0); eigen_assert(kSize > 2); // why would you do this? @@ -100,7 +100,7 @@ class RunQueue { // PopBack removes and returns the last elements in the queue. // Can fail spuriously. Work PopBack() { - if (Empty()) return 0; + if (Empty()) return Work(); std::unique_lock<std::mutex> lock(mutex_, std::try_to_lock); if (!lock) return Work(); unsigned back = back_.load(std::memory_order_relaxed); diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h index 17fd1658b..e75d0f467 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h @@ -24,7 +24,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment()) : env_(env), threads_(num_threads), waiters_(num_threads) { for (int i = 0; i < num_threads; i++) { - threads_.push_back(env.CreateThread([this]() { WorkerLoop(); })); + threads_.push_back(env.CreateThread([this, i]() { WorkerLoop(i); })); } } @@ -55,7 +55,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { // Schedule fn() for execution in the pool of threads. The functions are // executed in the order in which they are scheduled. 
- void Schedule(std::function<void()> fn) { + void Schedule(std::function<void()> fn) final { Task t = env_.CreateTask(std::move(fn)); std::unique_lock<std::mutex> l(mu_); if (waiters_.empty()) { @@ -69,9 +69,25 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { } } + int NumThreads() const final { + return static_cast<int>(threads_.size()); + } + + int CurrentThreadId() const final { + const PerThread* pt = this->GetPerThread(); + if (pt->pool == this) { + return pt->thread_id; + } else { + return -1; + } + } + protected: - void WorkerLoop() { + void WorkerLoop(int thread_id) { std::unique_lock<std::mutex> l(mu_); + PerThread* pt = GetPerThread(); + pt->pool = this; + pt->thread_id = thread_id; Waiter w; Task t; while (!exiting_) { @@ -111,13 +127,24 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { bool ready; }; + struct PerThread { + constexpr PerThread() : pool(NULL), thread_id(-1) { } + SimpleThreadPoolTempl* pool; // Parent pool, or null for normal threads. + int thread_id; // Worker thread index in pool. + }; + Environment env_; std::mutex mu_; MaxSizeVector<Thread*> threads_; // All threads MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads. - std::deque<Task> pending_; // Queue of pending work - std::condition_variable empty_; // Signaled on pending_.empty() + std::deque<Task> pending_; // Queue of pending work + std::condition_variable empty_; // Signaled on pending_.empty() bool exiting_ = false; + + PerThread* GetPerThread() const { + EIGEN_THREAD_LOCAL PerThread per_thread; + return &per_thread; + } }; typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool; diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h index d2204ad5b..399f95cc1 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h @@ -21,14 +21,14 @@ struct StlThreadEnvironment { // destructor must join the thread. class EnvThread { public: - EnvThread(std::function<void()> f) : thr_(f) {} + EnvThread(std::function<void()> f) : thr_(std::move(f)) {} ~EnvThread() { thr_.join(); } private: std::thread thr_; }; - EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); } + EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); } Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; } void ExecuteTask(const Task& t) { t.f(); } }; diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h index 38b40aceb..a65ee97c9 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h @@ -18,6 +18,13 @@ class ThreadPoolInterface { public: virtual void Schedule(std::function<void()> fn) = 0; + // Returns the number of threads in the pool. + virtual int NumThreads() const = 0; + + // Returns a logical thread index between 0 and NumThreads() - 1 if called + // from one of the threads in the pool. Returns -1 otherwise. 
+ virtual int CurrentThreadId() const = 0; + virtual ~ThreadPoolInterface() {} }; diff --git a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt deleted file mode 100644 index 7eab492d6..000000000 --- a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_CXX11_util_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_util_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/util COMPONENT Devel - ) diff --git a/unsupported/Eigen/CXX11/src/util/EmulateArray.h b/unsupported/Eigen/CXX11/src/util/EmulateArray.h index 24159e54c..30d3ebcff 100644 --- a/unsupported/Eigen/CXX11/src/util/EmulateArray.h +++ b/unsupported/Eigen/CXX11/src/util/EmulateArray.h @@ -117,7 +117,7 @@ template <typename T, size_t n> class array { values[7] = v8; } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array(std::initializer_list<T> l) { eigen_assert(l.size() == n); @@ -167,7 +167,7 @@ template <typename T> class array<T, 0> { EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE array() : dummy() { } -#ifdef EIGEN_HAS_VARIADIC_TEMPLATES +#if EIGEN_HAS_VARIADIC_TEMPLATES EIGEN_DEVICE_FUNC array(std::initializer_list<T> l) : dummy() { eigen_assert(l.size() == 0); } diff --git a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h index 961456f10..4bc3dd1ba 100644 --- a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h +++ b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h @@ -55,6 +55,17 @@ class MaxSizeVector { internal::aligned_free(data_); } + void resize(size_t n) { + eigen_assert(n <= reserve_); + for (size_t i = size_; i < n; ++i) { + new (&data_[i]) T; + } + for (size_t i = n; i < size_; ++i) { + data_[i].~T(); + } + size_ = n; + } + // Append new elements (up to reserved size). EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void push_back(const T& t) { |