Diffstat (limited to 'unsupported')
89 files changed, 4328 insertions, 898 deletions
diff --git a/unsupported/Eigen/CMakeLists.txt b/unsupported/Eigen/CMakeLists.txt
index 2fc8db412..631a06014 100644
--- a/unsupported/Eigen/CMakeLists.txt
+++ b/unsupported/Eigen/CMakeLists.txt
@@ -18,6 +18,7 @@ set(Eigen_HEADERS
   Polynomials
   Skyline
   SparseExtra
+  SpecialFunctions
   Splines
   )
@@ -26,5 +27,6 @@ install(FILES
   DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel
   )
-add_subdirectory(src)
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
+
 add_subdirectory(CXX11)
diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt
index a40bc4715..385ed240c 100644
--- a/unsupported/Eigen/CXX11/CMakeLists.txt
+++ b/unsupported/Eigen/CXX11/CMakeLists.txt
@@ -5,4 +5,4 @@ install(FILES
   DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel
   )
-add_subdirectory(src)
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h")
diff --git a/unsupported/Eigen/CXX11/Tensor b/unsupported/Eigen/CXX11/Tensor
index 859147404..f7b94cee1 100644
--- a/unsupported/Eigen/CXX11/Tensor
+++ b/unsupported/Eigen/CXX11/Tensor
@@ -15,6 +15,7 @@
 #include <Eigen/src/Core/util/DisableStupidWarnings.h>
 
+#include "../SpecialFunctions"
 #include "src/util/CXX11Meta.h"
 #include "src/util/MaxSizeVector.h"
@@ -80,6 +81,7 @@ typedef unsigned __int64 uint64_t;
 #include "src/Tensor/TensorTraits.h"
 #include "src/Tensor/TensorUInt128.h"
 #include "src/Tensor/TensorIntDiv.h"
+#include "src/Tensor/TensorGlobalFunctions.h"
 
 #include "src/Tensor/TensorBase.h"
diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt
deleted file mode 100644
index 1734262bb..000000000
--- a/unsupported/Eigen/CXX11/src/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-add_subdirectory(util)
-add_subdirectory(ThreadPool)
-add_subdirectory(Tensor)
-add_subdirectory(TensorSymmetry)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
deleted file mode 100644
index 6d4b3ea0d..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_Tensor_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_CXX11_Tensor_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Tensor COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/CXX11/src/Tensor/README.md b/unsupported/Eigen/CXX11/src/Tensor/README.md
index fda33edda..02146527b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/README.md
+++ b/unsupported/Eigen/CXX11/src/Tensor/README.md
@@ -1102,7 +1102,7 @@ Example: Reduction along two dimensions.
 
 As a special case, if you pass no parameter to a reduction operation the
 original tensor is reduced along *all* its dimensions.  The result is a
-one-dimension tensor with a single value.
+scalar, represented as a zero-dimension tensor.
 
     Eigen::Tensor<float, 3> a(2, 3, 4);
     a.setValues({{{0.0f, 1.0f, 2.0f, 3.0f},
@@ -1112,7 +1112,7 @@ one-dimension tensor with a single value.
                   {19.0f, 18.0f, 17.0f, 16.0f},
                   {20.0f, 21.0f, 22.0f, 23.0f}}});
     // Reduce along all dimensions using the sum() operator.
-    Eigen::Tensor<float, 1> b = a.sum();
+    Eigen::Tensor<float, 0> b = a.sum();
     cout << "b" << endl << b << endl << endl;
    =>
    b
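Note on the README change above: a full reduction now yields a rank-0 tensor, and the scalar is read back with operator(). A minimal sketch of the updated usage (illustrative, not part of the commit):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 3> a(2, 3, 4);
      a.setConstant(1.0f);
      Eigen::Tensor<float, 0> b = a.sum();  // rank-0 result instead of rank-1
      float total = b();                    // read the scalar back
      std::cout << total << std::endl;      // prints 24
      return 0;
    }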
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index 1eaa8d4fc..7a45a5cf4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -192,6 +192,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived>
+    log1p() const {
+      return unaryExpr(internal::scalar_log1p_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
     abs() const {
       return unaryExpr(internal::scalar_abs_op<Scalar>());
@@ -204,34 +210,74 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_pow_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >, const Derived>
     pow(Scalar exponent) const {
-      return unaryExpr(internal::scalar_pow_op<Scalar>(exponent));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_pow_op<Scalar,Scalar> >(exponent));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_real_op<Scalar>, const Derived>
+    real() const {
+      return unaryExpr(internal::scalar_real_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_imag_op<Scalar>, const Derived>
+    imag() const {
+      return unaryExpr(internal::scalar_imag_op<Scalar>());
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_add_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >, const Derived>
     operator+ (Scalar rhs) const {
-      return unaryExpr(internal::scalar_add_op<Scalar>(rhs));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_sum_op<Scalar,Scalar> >(rhs));
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_sub_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_sum_op<Scalar> >, const Derived>
+    operator+ (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_sum_op<Scalar> >(lhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >, const Derived>
     operator- (Scalar rhs) const {
       EIGEN_STATIC_ASSERT((NumTraits<Scalar>::IsSigned || internal::is_same<Scalar, const std::complex<float> >::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
-      return unaryExpr(internal::scalar_sub_op<Scalar>(rhs));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_difference_op<Scalar,Scalar> >(rhs));
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_difference_op<Scalar> >, const Derived>
+    operator- (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_difference_op<Scalar> >(lhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >, const Derived>
    operator* (Scalar rhs) const {
-      return unaryExpr(internal::scalar_multiple_op<Scalar>(rhs));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_product_op<Scalar,Scalar> >(rhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_product_op<Scalar> >, const Derived>
+    operator* (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_product_op<Scalar> >(lhs));
     }
 
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_quotient1_op<Scalar>, const Derived>
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >, const Derived>
     operator/ (Scalar rhs) const {
-      return unaryExpr(internal::scalar_quotient1_op<Scalar>(rhs));
+      return unaryExpr(internal::bind2nd_op<internal::scalar_quotient_op<Scalar,Scalar> >(rhs));
+    }
+
+    EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE friend
+    const TensorCwiseUnaryOp<internal::bind1st_op<internal::scalar_quotient_op<Scalar> >, const Derived>
+    operator/ (Scalar lhs, const Derived& rhs) {
+      return rhs.unaryExpr(internal::bind1st_op<internal::scalar_quotient_op<Scalar> >(lhs));
     }
 
     EIGEN_DEVICE_FUNC
@@ -277,7 +323,6 @@ class TensorBase<Derived, ReadOnlyAccessors>
       return unaryExpr(internal::scalar_floor_op<Scalar>());
     }
 
-
     // Generic binary operation support.
     template <typename CustomBinaryOp, typename OtherDerived> EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<CustomBinaryOp, const Derived, const OtherDerived>
@@ -342,66 +387,66 @@ class TensorBase<Derived, ReadOnlyAccessors>
     // Comparisons and tests.
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const OtherDerived>
     operator<(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LT>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>());
     }
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const OtherDerived>
     operator<=(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_LE>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>());
    }
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const OtherDerived>
     operator>(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GT>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>());
     }
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const OtherDerived>
     operator>=(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_GE>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>());
     }
 
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const OtherDerived>
     operator==(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_EQ>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>());
     }
 
     template<typename OtherDerived> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
-    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
+    const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const OtherDerived>
     operator!=(const OtherDerived& other) const {
-      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>());
+      return binaryExpr(other.derived(), internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>());
     }
 
     // comparisons and tests for Scalars
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator<(Scalar threshold) const {
       return operator<(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_LE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator<=(Scalar threshold) const {
       return operator<=(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GT>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator>(Scalar threshold) const {
       return operator>(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_GE>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator>=(Scalar threshold) const {
       return operator>=(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_EQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator==(Scalar threshold) const {
       return operator==(constant(threshold));
     }
     EIGEN_DEVICE_FUNC
-    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
+    EIGEN_STRONG_INLINE const TensorCwiseBinaryOp<internal::scalar_cmp_op<Scalar, Scalar, internal::cmp_NEQ>, const Derived, const TensorCwiseNullaryOp<internal::scalar_constant_op<Scalar>, const Derived> >
     operator!=(Scalar threshold) const {
       return operator!=(constant(threshold));
     }
@@ -457,15 +502,22 @@ class TensorBase<Derived, ReadOnlyAccessors>
     typedef TensorScanOp<internal::SumReducer<CoeffReturnType>, const Derived> TensorScanSumOp;
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorScanSumOp
-    cumsum(const Index& axis) const {
-      return TensorScanSumOp(derived(), axis);
+    cumsum(const Index& axis, bool exclusive = false) const {
+      return TensorScanSumOp(derived(), axis, exclusive);
     }
 
     typedef TensorScanOp<internal::ProdReducer<CoeffReturnType>, const Derived> TensorScanProdOp;
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorScanProdOp
-    cumprod(const Index& axis) const {
-      return TensorScanProdOp(derived(), axis);
+    cumprod(const Index& axis, bool exclusive = false) const {
+      return TensorScanProdOp(derived(), axis, exclusive);
+    }
+
+    template <typename Reducer>
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+    const TensorScanOp<Reducer, const Derived>
+    scan(const Index& axis, const Reducer& reducer, bool exclusive = false) const {
+      return TensorScanOp<Reducer, const Derived>(derived(), axis, exclusive, reducer);
     }
 
     // Reductions.
@@ -771,8 +823,8 @@ class TensorBase<Derived, ReadOnlyAccessors>
     EIGEN_STRONG_INLINE const Derived& derived() const { return *static_cast<const Derived*>(this); }
 };
 
-template<typename Derived>
-class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> {
+template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value>
+class TensorBase : public TensorBase<Derived, ReadOnlyAccessors> {
  public:
     typedef internal::traits<Derived> DerivedTraits;
     typedef typename DerivedTraits::Scalar Scalar;
@@ -782,7 +834,7 @@ class TensorBase<Derived, WriteAccessors> : public TensorBase<Derived, ReadOnlyAccessors> {
 
     template <typename Scalar, int NumIndices, int Options, typename IndexType> friend class Tensor;
     template <typename Scalar, typename Dimensions, int Option, typename IndexTypes> friend class TensorFixedSize;
-    template <typename OtherDerived, int AccessLevel> friend class TensorBase;
+    template <typename OtherDerived, int OtherAccessLevel> friend class TensorBase;
 
     EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE Derived& setZero() {
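The TensorBase.h changes above add scalar-on-the-left arithmetic (via bind1st_op), an exclusive mode for cumsum/cumprod, and a generic scan(). A minimal sketch against this commit's API (the values in comments follow from the semantics, they are not output captured from the commit):

    #include <unsupported/Eigen/CXX11/Tensor>

    int main() {
      Eigen::Tensor<float, 1> t(4);
      t.setValues({1.0f, 2.0f, 3.0f, 4.0f});

      // A scalar may now appear on the left of +, -, * and /.
      Eigen::Tensor<float, 1> u = 1.0f / t;

      // Exclusive scan: each output drops the element at its own position.
      Eigen::Tensor<float, 1> c = t.cumsum(0, /*exclusive=*/true);  // 0, 1, 3, 6

      // Generic scan with an arbitrary reducer.
      Eigen::Tensor<float, 1> m = t.scan(0, Eigen::internal::MaxReducer<float>());
      return 0;
    }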
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 56d9c2025..a6001074b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -25,8 +25,8 @@ template<typename Dimensions, typename LhsXprType, typename RhsXprType>
 struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
 {
   // Type promotion to handle the case where the types of the lhs and the rhs are different.
-  typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
-                                                  typename RhsXprType::Scalar>::ret Scalar;
+  typedef typename gebp_traits<typename LhsXprType::Scalar, typename RhsXprType::Scalar>::ResScalar Scalar;
+
   typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
                                         typename traits<RhsXprType>::StorageKind>::ret StorageKind;
   typedef typename promote_index_type<typename traits<LhsXprType>::Index,
@@ -75,8 +75,8 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXprType, RhsXprType>, ReadOnlyAccessors>
 {
   public:
   typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
-  typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
-                                                  typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+  typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
+                                         typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
index 886474986..d65dbb40f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionCuda.h
@@ -461,8 +461,8 @@ EigenContractionKernelInternal(const LhsMapper lhs, const RhsMapper rhs,
 #undef writeResultShmem
 #undef writeRow
 
-  const int max_i_write = (min)((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
-  const int max_j_write = (min)((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
+  const int max_i_write = numext::mini((int)((m_size - base_m - threadIdx.y + 7) / 8), 8);
+  const int max_j_write = numext::mini((int)((n_size - base_n - threadIdx.z + 7) / 8), 8);
 
   if (threadIdx.x < max_i_write) {
     if (max_j_write == 8) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
index b27e1a1b4..9b2cb3ff6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionMapper.h
@@ -130,19 +130,19 @@ class SimpleTensorContractionMapper {
     }
 
     Index contract_val = left ? col : row;
-    for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
-      const Index idx = contract_val / m_k_strides[i];
-      linidx += idx * m_contract_strides[i];
-      contract_val -= idx * m_k_strides[i];
-    }
-
     if(array_size<contract_t>::value > 0) {
-      if (side == Rhs && inner_dim_contiguous) {
-        eigen_assert(m_contract_strides[0] == 1);
-        linidx += contract_val;
-      } else {
-        linidx += contract_val * m_contract_strides[0];
-      }
+      for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+        const Index idx = contract_val / m_k_strides[i];
+        linidx += idx * m_contract_strides[i];
+        contract_val -= idx * m_k_strides[i];
+      }
+
+      if (side == Rhs && inner_dim_contiguous) {
+        eigen_assert(m_contract_strides[0] == 1);
+        linidx += contract_val;
+      } else {
+        linidx += contract_val * m_contract_strides[0];
+      }
     }
     return linidx;
@@ -153,15 +153,15 @@ class SimpleTensorContractionMapper {
     const bool left = (side == Lhs);
    Index nocontract_val[2] = {left ? row : col, left ? row + distance : col};
     Index linidx[2] = {0, 0};
-    for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
-      const Index idx0 = nocontract_val[0] / m_ij_strides[i];
-      const Index idx1 = nocontract_val[1] / m_ij_strides[i];
-      linidx[0] += idx0 * m_nocontract_strides[i];
-      linidx[1] += idx1 * m_nocontract_strides[i];
-      nocontract_val[0] -= idx0 * m_ij_strides[i];
-      nocontract_val[1] -= idx1 * m_ij_strides[i];
-    }
     if (array_size<typename Tensor::Dimensions>::value > array_size<contract_t>::value) {
+      for (int i = static_cast<int>(array_size<nocontract_t>::value) - 1; i > 0; i--) {
+        const Index idx0 = nocontract_val[0] / m_ij_strides[i];
+        const Index idx1 = nocontract_val[1] / m_ij_strides[i];
+        linidx[0] += idx0 * m_nocontract_strides[i];
+        linidx[1] += idx1 * m_nocontract_strides[i];
+        nocontract_val[0] -= idx0 * m_ij_strides[i];
+        nocontract_val[1] -= idx1 * m_ij_strides[i];
+      }
       if (side == Lhs && inner_dim_contiguous) {
         eigen_assert(m_nocontract_strides[0] == 1);
         linidx[0] += nocontract_val[0];
@@ -173,22 +173,24 @@ class SimpleTensorContractionMapper {
     }
 
     Index contract_val[2] = {left ? col : row, left ? col : row + distance};
-    for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
-      const Index idx0 = contract_val[0] / m_k_strides[i];
-      const Index idx1 = contract_val[1] / m_k_strides[i];
-      linidx[0] += idx0 * m_contract_strides[i];
-      linidx[1] += idx1 * m_contract_strides[i];
-      contract_val[0] -= idx0 * m_k_strides[i];
-      contract_val[1] -= idx1 * m_k_strides[i];
-    }
+    if (array_size<contract_t>::value> 0) {
+      for (int i = static_cast<int>(array_size<contract_t>::value) - 1; i > 0; i--) {
+        const Index idx0 = contract_val[0] / m_k_strides[i];
+        const Index idx1 = contract_val[1] / m_k_strides[i];
+        linidx[0] += idx0 * m_contract_strides[i];
+        linidx[1] += idx1 * m_contract_strides[i];
+        contract_val[0] -= idx0 * m_k_strides[i];
+        contract_val[1] -= idx1 * m_k_strides[i];
+      }
 
-    if (side == Rhs && inner_dim_contiguous) {
-      eigen_assert(m_contract_strides[0] == 1);
-      linidx[0] += contract_val[0];
-      linidx[1] += contract_val[1];
-    } else {
-      linidx[0] += contract_val[0] * m_contract_strides[0];
-      linidx[1] += contract_val[1] * m_contract_strides[0];
+      if (side == Rhs && inner_dim_contiguous) {
+        eigen_assert(m_contract_strides[0] == 1);
+        linidx[0] += contract_val[0];
+        linidx[1] += contract_val[1];
+      } else {
+        linidx[0] += contract_val[0] * m_contract_strides[0];
+        linidx[1] += contract_val[1] * m_contract_strides[0];
+      }
     }
     return IndexPair<Index>(linidx[0], linidx[1]);
   }
@@ -200,7 +202,7 @@ class SimpleTensorContractionMapper {
     return (Alignment == Aligned) && (side == Lhs) && inner_dim_contiguous ? 0 : size;
   }
   EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE Index stride() const {
-    return ((side == Lhs) && inner_dim_contiguous) ? m_contract_strides[0] : 1;
+    return ((side == Lhs) && inner_dim_contiguous && array_size<contract_t>::value > 0) ? m_contract_strides[0] : 1;
   }
 
  protected:
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index a60a17049..ee16cde9b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -202,7 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice>
     // across k dimension.
     const TensorOpCost cost = contractionCost(m, n, bm, bn, bk, shard_by_col, false);
-    Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+    int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
         static_cast<double>(n) * m, cost, this->m_device.numThreads());
 
     // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
@@ -301,7 +301,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice>
   class Context {
    public:
     Context(const Device& device, int num_threads, LhsMapper& lhs,
-            RhsMapper& rhs, Scalar* buffer, Index m, Index n, Index k, Index bm,
+            RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
             Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
             Index gn, Index nm0, Index nn0, bool shard_by_col,
             bool parallel_pack)
@@ -309,13 +309,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice>
           lhs_(lhs),
           rhs_(rhs),
           buffer_(buffer),
-          output_(buffer, m),
+          output_(buffer, tm),
           num_threads_(num_threads),
           shard_by_col_(shard_by_col),
           parallel_pack_(parallel_pack),
-          m_(m),
-          n_(n),
-          k_(k),
+          m_(tm),
+          n_(tn),
+          k_(tk),
           bm_(bm),
           bn_(bn),
           bk_(bk),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index a76c8ca35..d66e45d50 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -91,21 +91,21 @@ class TensorOpCost {
   }
 
   // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
-      const TensorOpCost& rhs) {
-    bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
-    bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
-    compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
-    return *this;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
+      const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
   }
 
   // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
-      const TensorOpCost& rhs) {
-    bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
-    bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
-    compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
-    return *this;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
+      const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
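cwiseMin/cwiseMax above switch from mutating *this to returning a fresh TensorOpCost. A short sketch of the new value semantics (illustrative, not part of the commit):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::TensorOpCost a(/*bytes_loaded=*/64, /*bytes_stored=*/0, /*compute_cycles=*/10);
      Eigen::TensorOpCost b(32, 8, 20);
      Eigen::TensorOpCost lo = a.cwiseMin(b);  // a and b are left unchanged
      Eigen::TensorOpCost hi = a.cwiseMax(b);
      std::cout << lo.bytes_loaded() << " " << lo.compute_cycles() << "\n";  // 32 10
      std::cout << hi.bytes_loaded() << " " << hi.compute_cycles() << "\n";  // 64 20
      return 0;
    }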
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
index 6c12b2ed8..1468caa23 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceCuda.h
@@ -12,6 +12,8 @@
 
 namespace Eigen {
 
+static const int kCudaScratchSize = 1024;
+
 // This defines an interface that GPUDevice can take to use
 // CUDA streams underneath.
 class StreamInterface {
@@ -27,6 +29,12 @@ class StreamInterface {
 
   // Return a scratchpad buffer of size 1k
   virtual void* scratchpad() const = 0;
+
+  // Return a semaphore. The semaphore is initially initialized to 0, and
+  // each kernel using it is responsible for resetting to 0 upon completion
+  // to maintain the invariant that the semaphore is always equal to 0 upon
+  // each kernel start.
+  virtual unsigned int* semaphore() const = 0;
 };
 
 static cudaDeviceProp* m_deviceProperties;
@@ -65,12 +73,12 @@ static const cudaStream_t default_stream = cudaStreamDefault;
 class CudaStreamDevice : public StreamInterface {
  public:
   // Use the default stream on the current device
-  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL) {
+  CudaStreamDevice() : stream_(&default_stream), scratch_(NULL), semaphore_(NULL) {
     cudaGetDevice(&device_);
     initializeDeviceProp();
   }
   // Use the default stream on the specified device
-  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL) {
+  CudaStreamDevice(int device) : stream_(&default_stream), device_(device), scratch_(NULL), semaphore_(NULL) {
     initializeDeviceProp();
   }
   // Use the specified stream. Note that it's the
@@ -78,7 +86,7 @@ class CudaStreamDevice : public StreamInterface {
   // the specified device. If no device is specified the code
   // assumes that the stream is associated to the current gpu device.
   CudaStreamDevice(const cudaStream_t* stream, int device = -1)
-      : stream_(stream), device_(device), scratch_(NULL) {
+      : stream_(stream), device_(device), scratch_(NULL), semaphore_(NULL) {
     if (device < 0) {
       cudaGetDevice(&device_);
     } else {
@@ -123,15 +131,27 @@ class CudaStreamDevice : public StreamInterface {
 
   virtual void* scratchpad() const {
     if (scratch_ == NULL) {
-      scratch_ = allocate(1024);
+      scratch_ = allocate(kCudaScratchSize + sizeof(unsigned int));
     }
     return scratch_;
   }
 
+  virtual unsigned int* semaphore() const {
+    if (semaphore_ == NULL) {
+      char* scratch = static_cast<char*>(scratchpad()) + kCudaScratchSize;
+      semaphore_ = reinterpret_cast<unsigned int*>(scratch);
+      cudaError_t err = cudaMemsetAsync(semaphore_, 0, sizeof(unsigned int), *stream_);
+      EIGEN_UNUSED_VARIABLE(err)
+      assert(err == cudaSuccess);
+    }
+    return semaphore_;
+  }
+
  private:
   const cudaStream_t* stream_;
   int device_;
   mutable void* scratch_;
+  mutable unsigned int* semaphore_;
 };
 
 struct GpuDevice {
@@ -174,6 +194,15 @@ struct GpuDevice {
 #endif
   }
 
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE unsigned int* semaphore() const {
+#ifndef __CUDA_ARCH__
+    return stream_->semaphore();
+#else
+    eigen_assert(false && "The default device should be used instead to generate kernel code");
+    return NULL;
+#endif
+  }
+
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void memcpy(void* dst, const void* src, size_t n) const {
 #ifndef __CUDA_ARCH__
     cudaError_t err = cudaMemcpyAsync(dst, src, n, cudaMemcpyDeviceToDevice,
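The semaphore contract above (starts at 0, each kernel must restore 0 on completion) is what GPU kernels that need cross-block coordination rely on. A hypothetical kernel, not from this commit, showing one way to honor the invariant:

    // Hypothetical illustration: the last block to arrive does the final step,
    // then resets the semaphore so the invariant holds for the next launch.
    __global__ void LastBlockWins(unsigned int* semaphore, unsigned int num_blocks, float* out) {
      if (threadIdx.x == 0) {
        unsigned int ticket = atomicAdd(semaphore, 1u);  // one ticket per block
        if (ticket == num_blocks - 1) {
          *out = 1.0f;
          *semaphore = 0;
        }
      }
    }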
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index d31b0ad38..069680a11 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -106,7 +106,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
 // Build a thread pool device on top the an existing pool of threads.
 struct ThreadPoolDevice {
   // The ownership of the thread pool remains with the caller.
-  ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
+  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
 
   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return internal::aligned_malloc(num_bytes);
@@ -130,7 +130,7 @@ struct ThreadPoolDevice {
     ::memset(buffer, c, n);
   }
 
-  EIGEN_STRONG_INLINE size_t numThreads() const {
+  EIGEN_STRONG_INLINE int numThreads() const {
     return num_threads_;
   }
 
@@ -151,9 +151,7 @@ struct ThreadPoolDevice {
   template <class Function, class... Args>
   EIGEN_STRONG_INLINE Notification* enqueue(Function&& f, Args&&... args) const {
     Notification* n = new Notification();
-    std::function<void()> func =
-        std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...);
-    pool_->Schedule(func);
+    pool_->Schedule(std::bind(&FunctionWrapperWithNotification<Function, Args...>::run, n, f, args...));
     return n;
   }
 
@@ -161,15 +159,19 @@ struct ThreadPoolDevice {
   EIGEN_STRONG_INLINE void enqueue_with_barrier(Barrier* b, Function&& f,
                                                 Args&&... args) const {
-    std::function<void()> func = std::bind(
-        &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...);
-    pool_->Schedule(func);
+    pool_->Schedule(std::bind(
+        &FunctionWrapperWithBarrier<Function, Args...>::run, b, f, args...));
   }
 
   template <class Function, class... Args>
   EIGEN_STRONG_INLINE void enqueueNoNotification(Function&& f, Args&&... args) const {
-    std::function<void()> func = std::bind(f, args...);
-    pool_->Schedule(func);
+    pool_->Schedule(std::bind(f, args...));
+  }
+
+  // Returns a logical thread index between 0 and pool_->NumThreads() - 1 if
+  // called from one of the threads in pool_. Returns -1 otherwise.
+  EIGEN_STRONG_INLINE int currentThreadId() const {
+    return pool_->CurrentThreadId();
   }
 
   // parallelFor executes f with [0, n) arguments in parallel and waits for
@@ -182,7 +184,7 @@ struct ThreadPoolDevice {
                            std::function<void(Index, Index)> f) const {
     typedef TensorCostModel<ThreadPoolDevice> CostModel;
     if (n <= 1 || numThreads() == 1 ||
-        CostModel::numThreads(n, cost, numThreads()) == 1) {
+        CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
       f(0, n);
       return;
     }
@@ -242,7 +244,7 @@ struct ThreadPoolDevice {
     // Recursively divide size into halves until we reach block_size.
     // Division code rounds mid to block_size, so we are guaranteed to get
     // block_count leaves that do actual computations.
-    Barrier barrier(block_count);
+    Barrier barrier(static_cast<unsigned int>(block_count));
     std::function<void(Index, Index)> handleRange;
     handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
       if (last - first <= block_size) {
@@ -268,7 +270,7 @@ struct ThreadPoolDevice {
 
  private:
   ThreadPoolInterface* pool_;
-  size_t num_threads_;
+  int num_threads_;
 };
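A minimal sketch of driving the (now int-typed) ThreadPoolDevice and its parallelFor directly; the cost figures are illustrative, and the Eigen::ThreadPool type from the ThreadPool module is assumed:

    #define EIGEN_USE_THREADS
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <vector>

    int main() {
      Eigen::ThreadPool pool(4);
      Eigen::ThreadPoolDevice device(&pool, /*num_cores=*/4);  // int, not size_t

      std::vector<float> data(1000, 1.0f);
      Eigen::TensorOpCost cost(sizeof(float), sizeof(float), /*compute_cycles=*/1);
      device.parallelFor(data.size(), cost,
                         [&](Eigen::Index first, Eigen::Index last) {
                           for (Eigen::Index i = first; i < last; ++i) data[i] *= 2.0f;
                         });
      return 0;
    }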
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
index 26b1f65a8..a08dfa7c3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvalTo.h
@@ -94,7 +94,7 @@ struct TensorEvaluator<const TensorEvalToOp<ArgType>, Device>
   static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
 
   enum {
-    IsAligned = true,
+    IsAligned = TensorEvaluator<ArgType, Device>::IsAligned,
     PacketAccess = TensorEvaluator<ArgType, Device>::PacketAccess,
     Layout = TensorEvaluator<ArgType, Device>::Layout,
     CoordAccess = false,  // to be implemented
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index 31b361c83..33ffaa600 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -131,7 +131,7 @@ double loadConstant(const double* address) {
 }
 template <>
 EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
 Eigen::half loadConstant(const Eigen::half* address) {
-  return Eigen::half(internal::raw_uint16_to_half(__ldg(&address->x)));
+  return Eigen::half(half_impl::raw_uint16_to_half(__ldg(&address->x)));
 }
 #endif
 }
@@ -403,6 +403,101 @@ struct TensorEvaluator<const TensorCwiseBinaryOp<BinaryOp, LeftArgType, RightArgType>, Device>
   TensorEvaluator<RightArgType, Device> m_rightImpl;
 };
 
+// -------------------- CwiseTernaryOp --------------------
+
+template<typename TernaryOp, typename Arg1Type, typename Arg2Type, typename Arg3Type, typename Device>
+struct TensorEvaluator<const TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type>, Device>
+{
+  typedef TensorCwiseTernaryOp<TernaryOp, Arg1Type, Arg2Type, Arg3Type> XprType;
+
+  enum {
+    IsAligned = TensorEvaluator<Arg1Type, Device>::IsAligned & TensorEvaluator<Arg2Type, Device>::IsAligned & TensorEvaluator<Arg3Type, Device>::IsAligned,
+    PacketAccess = TensorEvaluator<Arg1Type, Device>::PacketAccess & TensorEvaluator<Arg2Type, Device>::PacketAccess & TensorEvaluator<Arg3Type, Device>::PacketAccess &
+                   internal::functor_traits<TernaryOp>::PacketAccess,
+    Layout = TensorEvaluator<Arg1Type, Device>::Layout,
+    CoordAccess = false,  // to be implemented
+    RawAccess = false
+  };
+
+  EIGEN_DEVICE_FUNC TensorEvaluator(const XprType& op, const Device& device)
+    : m_functor(op.functor()),
+      m_arg1Impl(op.arg1Expression(), device),
+      m_arg2Impl(op.arg2Expression(), device),
+      m_arg3Impl(op.arg3Expression(), device)
+  {
+    EIGEN_STATIC_ASSERT((static_cast<int>(TensorEvaluator<Arg1Type, Device>::Layout) == static_cast<int>(TensorEvaluator<Arg3Type, Device>::Layout) || internal::traits<XprType>::NumDimensions <= 1), YOU_MADE_A_PROGRAMMING_MISTAKE);
+
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg2Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::StorageKind,
+                         typename internal::traits<Arg3Type>::StorageKind>::value),
+                        STORAGE_KIND_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+                         typename internal::traits<Arg2Type>::Index>::value),
+                        STORAGE_INDEX_MUST_MATCH)
+    EIGEN_STATIC_ASSERT((internal::is_same<typename internal::traits<Arg1Type>::Index,
+                         typename internal::traits<Arg3Type>::Index>::value),
+                        STORAGE_INDEX_MUST_MATCH)
+
+    eigen_assert(dimensions_match(m_arg1Impl.dimensions(), m_arg2Impl.dimensions()) && dimensions_match(m_arg1Impl.dimensions(), m_arg3Impl.dimensions()));
+  }
+
+  typedef typename XprType::Index Index;
+  typedef typename XprType::Scalar Scalar;
+  typedef typename internal::traits<XprType>::Scalar CoeffReturnType;
+  typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType;
+  static const int PacketSize = internal::unpacket_traits<PacketReturnType>::size;
+  typedef typename TensorEvaluator<Arg1Type, Device>::Dimensions Dimensions;
+
+  EIGEN_DEVICE_FUNC const Dimensions& dimensions() const
+  {
+    // TODO: use arg2 or arg3 dimensions if they are known at compile time.
+    return m_arg1Impl.dimensions();
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
+    m_arg1Impl.evalSubExprsIfNeeded(NULL);
+    m_arg2Impl.evalSubExprsIfNeeded(NULL);
+    m_arg3Impl.evalSubExprsIfNeeded(NULL);
+    return true;
+  }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() {
+    m_arg1Impl.cleanup();
+    m_arg2Impl.cleanup();
+    m_arg3Impl.cleanup();
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
+  {
+    return m_functor(m_arg1Impl.coeff(index), m_arg2Impl.coeff(index), m_arg3Impl.coeff(index));
+  }
+  template<int LoadMode>
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
+  {
+    return m_functor.packetOp(m_arg1Impl.template packet<LoadMode>(index),
+                              m_arg2Impl.template packet<LoadMode>(index),
+                              m_arg3Impl.template packet<LoadMode>(index));
+  }
+
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
+  costPerCoeff(bool vectorized) const {
+    const double functor_cost = internal::functor_traits<TernaryOp>::Cost;
+    return m_arg1Impl.costPerCoeff(vectorized) +
+           m_arg2Impl.costPerCoeff(vectorized) +
+           m_arg3Impl.costPerCoeff(vectorized) +
+           TensorOpCost(0, 0, functor_cost, vectorized, PacketSize);
+  }
+
+  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+
+ private:
+  const TernaryOp m_functor;
+  TensorEvaluator<Arg1Type, Device> m_arg1Impl;
+  TensorEvaluator<Arg2Type, Device> m_arg2Impl;
+  TensorEvaluator<Arg3Type, Device> m_arg3Impl;
+};
+
 
 // -------------------- SelectOp --------------------
 
@@ -479,7 +574,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>, Device>
         .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; }
 
  private:
   TensorEvaluator<IfArgType, Device> m_condImpl;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
index ad5c97b57..a116bf17f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExecutor.h
@@ -159,7 +159,6 @@ class TensorExecutor<Expression, ThreadPoolDevice, Vectorizable> {
 #else
     size_t num_threads = device.numThreads();
     if (num_threads > 1) {
-      cost = evaluator.costPerCoeff(Vectorizable)
       num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
           size, evaluator.costPerCoeff(Vectorizable), num_threads);
     }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
index ea250d8bc..5f2e329f2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorExpr.h
@@ -219,6 +219,86 @@ class TensorCwiseBinaryOp : public TensorBase<TensorCwiseBinaryOp<BinaryOp, LhsXprType, RhsXprType>, ReadOnlyAccessors>
 
 namespace internal {
 
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct traits<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >
+{
+  // Type promotion to handle the case where the types of the args are different.
+  typedef typename result_of<
+      TernaryOp(typename Arg1XprType::Scalar,
+                typename Arg2XprType::Scalar,
+                typename Arg3XprType::Scalar)>::type Scalar;
+  typedef traits<Arg1XprType> XprTraits;
+  typedef typename traits<Arg1XprType>::StorageKind StorageKind;
+  typedef typename traits<Arg1XprType>::Index Index;
+  typedef typename Arg1XprType::Nested Arg1Nested;
+  typedef typename Arg2XprType::Nested Arg2Nested;
+  typedef typename Arg3XprType::Nested Arg3Nested;
+  typedef typename remove_reference<Arg1Nested>::type _Arg1Nested;
+  typedef typename remove_reference<Arg2Nested>::type _Arg2Nested;
+  typedef typename remove_reference<Arg3Nested>::type _Arg3Nested;
+  static const int NumDimensions = XprTraits::NumDimensions;
+  static const int Layout = XprTraits::Layout;
+
+  enum {
+    Flags = 0
+  };
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, Eigen::Dense>
+{
+  typedef const TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>& type;
+};
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+struct nested<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, 1, typename eval<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> >::type>
+{
+  typedef TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType> type;
+};
+
+}  // end namespace internal
+
+
+
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType>
+class TensorCwiseTernaryOp : public TensorBase<TensorCwiseTernaryOp<TernaryOp, Arg1XprType, Arg2XprType, Arg3XprType>, ReadOnlyAccessors>
+{
+  public:
+    typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Scalar Scalar;
+    typedef typename Eigen::NumTraits<Scalar>::Real RealScalar;
+    typedef Scalar CoeffReturnType;
+    typedef typename Eigen::internal::nested<TensorCwiseTernaryOp>::type Nested;
+    typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::StorageKind StorageKind;
+    typedef typename Eigen::internal::traits<TensorCwiseTernaryOp>::Index Index;
+
+    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorCwiseTernaryOp(const Arg1XprType& arg1, const Arg2XprType& arg2, const Arg3XprType& arg3, const TernaryOp& func = TernaryOp())
+        : m_arg1_xpr(arg1), m_arg2_xpr(arg2), m_arg3_xpr(arg3), m_functor(func) {}
+
+    EIGEN_DEVICE_FUNC
+    const TernaryOp& functor() const { return m_functor; }
+
+    /** \returns the nested expressions */
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename Arg1XprType::Nested>::type&
+    arg1Expression() const { return m_arg1_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename Arg2XprType::Nested>::type&
+    arg2Expression() const { return m_arg2_xpr; }
+
+    EIGEN_DEVICE_FUNC
+    const typename internal::remove_all<typename Arg3XprType::Nested>::type&
+    arg3Expression() const { return m_arg3_xpr; }
+
+  protected:
+    typename Arg1XprType::Nested m_arg1_xpr;
+    typename Arg2XprType::Nested m_arg2_xpr;
+    typename Arg3XprType::Nested m_arg3_xpr;
+    const TernaryOp m_functor;
+};
+
+
+namespace internal {
 template<typename IfXprType, typename ThenXprType, typename ElseXprType>
 struct traits<TensorSelectOp<IfXprType, ThenXprType, ElseXprType> >
     : traits<ThenXprType>
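TensorCwiseTernaryOp is generic: any ternary functor can be evaluated element-wise by constructing the expression directly (TensorBase does not expose a ternaryExpr() helper in this commit). A sketch with a hypothetical clamp functor:

    #include <unsupported/Eigen/CXX11/Tensor>

    // Hypothetical functor, not part of the commit.
    template <typename T>
    struct ClampOp {
      EIGEN_DEVICE_FUNC T operator()(const T& x, const T& lo, const T& hi) const {
        return x < lo ? lo : (x > hi ? hi : x);
      }
    };

    int main() {
      typedef Eigen::Tensor<float, 1> Vec;
      Vec x(3), lo(3), hi(3);
      x.setValues({-2.0f, 0.5f, 9.0f});
      lo.setConstant(0.0f);
      hi.setConstant(1.0f);
      Vec y = Eigen::TensorCwiseTernaryOp<ClampOp<float>, const Vec, const Vec, const Vec>(
          x, lo, hi, ClampOp<float>());  // 0, 0.5, 1
      return 0;
    }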
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
index ece2ed91b..08eb5595a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFFT.h
@@ -329,7 +329,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device>
 
     for (Index i = 0; i < n; ++i) {
       if(FFTDir == FFT_FORWARD) {
-        a[i] = data[i] * std::conj(pos_j_base_powered[i]);
+        a[i] = data[i] * numext::conj(pos_j_base_powered[i]);
       }
       else {
         a[i] = data[i] * pos_j_base_powered[i];
@@ -344,7 +344,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device>
         b[i] = pos_j_base_powered[i];
       }
       else {
-        b[i] = std::conj(pos_j_base_powered[i]);
+        b[i] = numext::conj(pos_j_base_powered[i]);
       }
     }
     for (Index i = n; i < m - n; ++i) {
@@ -355,7 +355,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device>
         b[i] = pos_j_base_powered[m-i];
       }
       else {
-        b[i] = std::conj(pos_j_base_powered[m-i]);
+        b[i] = numext::conj(pos_j_base_powered[m-i]);
       }
     }
 
@@ -379,7 +379,7 @@ struct TensorEvaluator<const TensorFFTOp<FFT, ArgType, FFTResultType, FFTDir>, Device>
 
     for (Index i = 0; i < n; ++i) {
       if(FFTDir == FFT_FORWARD) {
-        data[i] = a[i] * std::conj(pos_j_base_powered[i]);
+        data[i] = a[i] * numext::conj(pos_j_base_powered[i]);
       }
       else {
         data[i] = a[i] * pos_j_base_powered[i];
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
index 5d0548b84..c23ecdbc4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForcedEval.h
@@ -102,7 +102,7 @@ struct TensorEvaluator<const TensorForcedEvalOp<ArgType>, Device>
   EIGEN_DEVICE_FUNC const Dimensions& dimensions() const { return m_impl.dimensions(); }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(CoeffReturnType*) {
-    const Index numValues = m_impl.dimensions().TotalSize();
+    const Index numValues = internal::array_prod(m_impl.dimensions());
     m_buffer = (CoeffReturnType*)m_device.allocate(numValues * sizeof(CoeffReturnType));
 
     // Should initialize the memory in case we're dealing with non POD types.
     if (NumTraits<CoeffReturnType>::RequireInitialization) {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
index a1a18d938..490ddd8bd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorForwardDeclarations.h
@@ -16,11 +16,12 @@ template<typename Scalar_, int NumIndices_, int Options_ = 0, typename IndexType = DenseIndex> class Tensor;
 template<typename Scalar_, typename Dimensions, int Options_ = 0, typename IndexType = DenseIndex> class TensorFixedSize;
 template<typename PlainObjectType, int Options_ = Unaligned> class TensorMap;
 template<typename PlainObjectType> class TensorRef;
-template<typename Derived, int AccessLevel = internal::accessors_level<Derived>::value> class TensorBase;
+template<typename Derived, int AccessLevel> class TensorBase;
 
 template<typename NullaryOp, typename PlainObjectType> class TensorCwiseNullaryOp;
 template<typename UnaryOp, typename XprType> class TensorCwiseUnaryOp;
 template<typename BinaryOp, typename LeftXprType, typename RightXprType> class TensorCwiseBinaryOp;
+template<typename TernaryOp, typename Arg1XprType, typename Arg2XprType, typename Arg3XprType> class TensorCwiseTernaryOp;
 template<typename IfXprType, typename ThenXprType, typename ElseXprType> class TensorSelectOp;
 template<typename Op, typename Dims, typename XprType> class TensorReductionOp;
 template<typename XprType> class TensorIndexTupleOp;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 3dd32e9d1..a8e48fced 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -84,6 +84,14 @@ struct functor_traits<scalar_sigmoid_op<T> > {
 };
 
 
+template<typename Reducer, typename Device>
+struct reducer_traits {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
 // Standard reduction functors
 template <typename T> struct SumReducer
 {
@@ -119,6 +127,15 @@ template <typename T> struct SumReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<SumReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasAdd
+  };
+};
+
+
 template <typename T> struct MeanReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
@@ -162,6 +179,15 @@ template <typename T> struct MeanReducer
     DenseIndex packetCount_;
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MeanReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasAdd
+  };
+};
+
+
 template <typename T> struct MaxReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasMax;
@@ -195,6 +221,15 @@ template <typename T> struct MaxReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MaxReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasMax
+  };
+};
+
+
 template <typename T> struct MinReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasMin;
@@ -228,6 +263,14 @@ template <typename T> struct MinReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MinReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::HasMin
+  };
+};
+
 
 template <typename T> struct ProdReducer
 {
@@ -263,6 +306,14 @@ template <typename T> struct ProdReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ProdReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::MulCost,
+    PacketAccess = PacketType<T, Device>::HasMul
+  };
+};
+
 
 struct AndReducer
 {
@@ -280,6 +331,15 @@ struct AndReducer
   }
 };
 
+template <typename Device>
+struct reducer_traits<AndReducer, Device> {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
+
 struct OrReducer {
   static const bool PacketAccess = false;
   static const bool IsStateful = false;
@@ -295,6 +355,15 @@ struct OrReducer {
   }
 };
 
+template <typename Device>
+struct reducer_traits<OrReducer, Device> {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
+
 // Argmin/Argmax reducers
 template <typename T> struct ArgMaxTupleReducer
 {
@@ -312,6 +381,15 @@ template <typename T> struct ArgMaxTupleReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = false
+  };
+};
+
+
 template <typename T> struct ArgMinTupleReducer
 {
   static const bool PacketAccess = false;
@@ -328,6 +406,14 @@ template <typename T> struct ArgMinTupleReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ArgMinTupleReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = false
+  };
+};
+
 
 // Random number generation
 namespace {
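The new reducer_traits template lets device-aware code query a reducer's cost and vectorizability per reducer/device pair, instead of reading the reducer's static PacketAccess member. A small sketch (illustrative):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      typedef Eigen::internal::reducer_traits<
          Eigen::internal::SumReducer<float>, Eigen::DefaultDevice> SumTraits;
      std::cout << "cost=" << int(SumTraits::Cost)
                << " packet=" << bool(SumTraits::PacketAccess) << std::endl;
      return 0;
    }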
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
new file mode 100644
index 000000000..665b861cf
--- /dev/null
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGlobalFunctions.h
@@ -0,0 +1,33 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+#define EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
+
+namespace Eigen {
+
+/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) to the given tensors.
+  *
+  * This function computes the regularized incomplete beta function (integral).
+  *
+  */
+template <typename ADerived, typename BDerived, typename XDerived>
+EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const
+    TensorCwiseTernaryOp<internal::scalar_betainc_op<typename XDerived::Scalar>,
+                         const ADerived, const BDerived, const XDerived>
+    betainc(const ADerived& a, const BDerived& b, const XDerived& x) {
+  return TensorCwiseTernaryOp<
+      internal::scalar_betainc_op<typename XDerived::Scalar>, const ADerived,
+      const BDerived, const XDerived>(
+      a, b, x, internal::scalar_betainc_op<typename XDerived::Scalar>());
+}
+
+} // end namespace Eigen
+
+#endif // EIGEN_CXX11_TENSOR_TENSOR_GLOBAL_FUNCTIONS_H
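Usage of the new global entry point added above; betainc(a, b, x) evaluates the regularized incomplete beta function element-wise:

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 1> a(3), b(3), x(3);
      a.setConstant(2.0f);
      b.setConstant(3.0f);
      x.setValues({0.1f, 0.5f, 0.9f});
      Eigen::Tensor<float, 1> r = Eigen::betainc(a, b, x);  // I_x(a, b) per element
      std::cout << r << std::endl;
      return 0;
    }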
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
index 38a833f82..a901c5dd4 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorIO.h
@@ -13,38 +13,61 @@
 namespace Eigen {
 
 namespace internal {
-template<>
-struct significant_decimals_impl<std::string>
-    : significant_decimals_default_impl<std::string, true>
-{};
-}
+
+// Print the tensor as a 2d matrix
+template <typename Tensor, int Rank>
+struct TensorPrinter {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+    typedef typename Tensor::Index Index;
+    const Index total_size = internal::array_prod(tensor.dimensions());
+    if (total_size > 0) {
+      const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
+      static const int layout = Tensor::Layout;
+      Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
+      os << matrix;
+    }
+  }
+};
+
+
+// Print the tensor as a vector
+template <typename Tensor>
+struct TensorPrinter<Tensor, 1> {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    typedef typename internal::remove_const<typename Tensor::Scalar>::type Scalar;
+    typedef typename Tensor::Index Index;
+    const Index total_size = internal::array_prod(tensor.dimensions());
+    if (total_size > 0) {
+      Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
+      os << array;
+    }
+  }
+};
+
+
+// Print the tensor as a scalar
+template <typename Tensor>
+struct TensorPrinter<Tensor, 0> {
+  static void run (std::ostream& os, const Tensor& tensor) {
+    os << tensor.coeff(0);
+  }
+};
+}
 
 template <typename T>
 std::ostream& operator << (std::ostream& os, const TensorBase<T, ReadOnlyAccessors>& expr) {
+  typedef TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> Evaluator;
+  typedef typename Evaluator::Dimensions Dimensions;
+
   // Evaluate the expression if needed
   TensorForcedEvalOp<const T> eval = expr.eval();
-  TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice> tensor(eval, DefaultDevice());
+  Evaluator tensor(eval, DefaultDevice());
   tensor.evalSubExprsIfNeeded(NULL);
 
-  typedef typename internal::remove_const<typename T::Scalar>::type Scalar;
-  typedef typename T::Index Index;
-  typedef typename TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Dimensions Dimensions;
-  const Index total_size = internal::array_prod(tensor.dimensions());
-
-  // Print the tensor as a 1d vector or a 2d matrix.
+  // Print the result
   static const int rank = internal::array_size<Dimensions>::value;
-  if (rank == 0) {
-    os << tensor.coeff(0);
-  } else if (rank == 1) {
-    Map<const Array<Scalar, Dynamic, 1> > array(const_cast<Scalar*>(tensor.data()), total_size);
-    os << array;
-  } else {
-    const Index first_dim = Eigen::internal::array_get<0>(tensor.dimensions());
-    static const int layout = TensorEvaluator<const TensorForcedEvalOp<const T>, DefaultDevice>::Layout;
-    Map<const Array<Scalar, Dynamic, Dynamic, layout> > matrix(const_cast<Scalar*>(tensor.data()), first_dim, total_size/first_dim);
-    os << matrix;
-  }
+  internal::TensorPrinter<Evaluator, rank>::run(os, tensor);
 
   // Cleanup.
   tensor.cleanup();
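The printing logic above is now dispatched on rank at compile time. A short sketch of what each rank prints (illustrative):

    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 3> t(2, 2, 2);
      t.setConstant(1.0f);
      std::cout << t << std::endl;  // rank >= 2: 2-d matrix, first_dim x remainder

      Eigen::Tensor<float, 0> s = t.sum();
      std::cout << s << std::endl;  // rank 0: bare scalar, prints 8
      return 0;
    }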
- __builtin_clzll(static_cast<uint64_t>(val)) : - __builtin_clz(static_cast<uint32_t>(val)); + return __builtin_clzll(static_cast<uint64_t>(val)); #endif } @@ -98,7 +120,9 @@ namespace { return static_cast<uint64_t>((static_cast<__uint128_t>(1) << (64+log_div)) / static_cast<__uint128_t>(divider) - (static_cast<__uint128_t>(1) << 64) + 1); #else const uint64_t shift = 1ULL << log_div; - TensorUInt128<uint64_t, uint64_t> result = (TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + TensorUInt128<static_val<0>, static_val<1> >(1)); + TensorUInt128<uint64_t, uint64_t> result = TensorUInt128<uint64_t, static_val<0> >(shift, 0) / TensorUInt128<static_val<0>, uint64_t>(divider) + - TensorUInt128<static_val<1>, static_val<0> >(1, 0) + + TensorUInt128<static_val<0>, static_val<1> >(1); return static_cast<uint64_t>(result); #endif } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h index b1645d56f..fdb5ee6b8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h @@ -47,22 +47,39 @@ template <> struct max_n_1<0> { // Default packet types template <typename Scalar, typename Device> -struct PacketType { +struct PacketType : internal::packet_traits<Scalar> { typedef typename internal::packet_traits<Scalar>::type type; - enum { size = internal::unpacket_traits<type>::size }; }; // For CUDA packet types when using a GpuDevice -#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) && defined(EIGEN_HAS_CUDA_FP16) template <> -struct PacketType<float, GpuDevice> { - typedef float4 type; - static const int size = 4; -}; -template <> -struct PacketType<double, GpuDevice> { - typedef double2 type; +struct PacketType<half, GpuDevice> { + typedef half2 type; static const int size = 2; + enum { + HasAdd = 1, + HasSub = 1, + HasMul = 1, + HasNegate = 1, + HasAbs = 1, + HasArg = 0, + HasAbs2 = 0, + HasMin = 1, + HasMax = 1, + HasConj = 0, + HasSetLinear = 0, + HasBlend = 0, + + HasDiv = 1, + HasSqrt = 1, + HasRsqrt = 1, + HasExp = 1, + HasLog = 1, + HasLog1p = 0, + HasLog10 = 0, + HasPow = 1, + }; }; #endif diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h index 52cfc2824..d34f1e328 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMorphing.h @@ -148,7 +148,7 @@ struct TensorEvaluator<const TensorReshapingOp<NewDimensions, ArgType>, Device> EIGEN_DEVICE_FUNC Scalar* data() const { return const_cast<Scalar*>(m_impl.data()); } - const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } + EIGEN_DEVICE_FUNC const TensorEvaluator<ArgType, Device>& impl() const { return m_impl; } protected: TensorEvaluator<ArgType, Device> m_impl; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 99a09c058..9df697e4c 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -264,7 +264,7 @@ struct FullReducer<Self, Op, ThreadPoolDevice, Vectorizable> { const Index numblocks = blocksize > 0 ? 
num_coeffs / blocksize : 0; eigen_assert(num_coeffs >= numblocks * blocksize); - Barrier barrier(numblocks); + Barrier barrier(internal::convert_index<unsigned int>(numblocks)); MaxSizeVector<typename Self::CoeffReturnType> shards(numblocks, reducer.initialize()); for (Index i = 0; i < numblocks; ++i) { device.enqueue_with_barrier(&barrier, &FullReducerShard<Self, Op, Vectorizable>::run, @@ -316,7 +316,7 @@ struct OuterReducer { #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) template <int B, int N, typename S, typename R, typename I> -__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); +__global__ void FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); #ifdef EIGEN_HAS_CUDA_FP16 @@ -492,7 +492,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> } // Attempt to use an optimized reduction. - else if (RunningOnGPU && data && (m_device.majorDeviceVersion() >= 3)) { + else if (RunningOnGPU && (m_device.majorDeviceVersion() >= 3)) { bool reducing_inner_dims = true; for (int i = 0; i < NumReducedDims; ++i) { if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { @@ -505,8 +505,20 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> (reducing_inner_dims || ReducingInnerMostDims)) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) { + data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } Op reducer(m_reducer); - return internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } } bool preserving_inner_dims = true; @@ -521,8 +533,20 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> preserving_inner_dims) { const Index num_values_to_reduce = internal::array_prod(m_reducedDims); const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions); + if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) { + data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve)); + m_result = data; + } Op reducer(m_reducer); - return internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve); + if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) { + if (m_result) { + m_device.deallocate(m_result); + m_result = NULL; + } + return true; + } else { + return (m_result != NULL); + } } } return true; @@ -537,8 +561,8 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType coeff(Index index) const { - if (RunningFullReduction && m_result) { - return *m_result; + if ((RunningFullReduction || RunningOnGPU) && m_result) { + return *(m_result + index); } Op reducer(m_reducer); if (ReducingInnerMostDims || RunningFullReduction) { @@ 
-558,7 +582,11 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const { EIGEN_STATIC_ASSERT((PacketSize > 1), YOU_MADE_A_PROGRAMMING_MISTAKE) - eigen_assert(index + PacketSize - 1 < dimensions().TotalSize()); + eigen_assert(index + PacketSize - 1 < Index(internal::array_prod(dimensions()))); + + if (RunningOnGPU && m_result) { + return internal::pload<PacketReturnType>(m_result + index); + } EIGEN_ALIGN_MAX typename internal::remove_const<CoeffReturnType>::type values[PacketSize]; if (ReducingInnerMostDims) { @@ -616,7 +644,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device> template <typename S, typename O, bool V> friend struct internal::FullReducerShard; #endif #if defined(EIGEN_USE_GPU) && defined(__CUDACC__) - template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*); + template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernel(R, const S, I, typename S::CoeffReturnType*, unsigned int*); #ifdef EIGEN_HAS_CUDA_FP16 template <typename S, typename R, typename I> friend void internal::ReductionInitFullReduxKernelHalfFloat(R, const S, I, half2*); template <int B, int N, typename S, typename R, typename I> friend void internal::FullReductionKernelHalfFloat(R, const S, I, half*, half2*); diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h index 45087a9a4..65638b6a8 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h @@ -67,11 +67,21 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer) #endif } +// We extend atomicExch to support extra data types +template <typename Type> +__device__ inline Type atomicExchCustom(Type* address, Type val) { + return atomicExch(address, val); +} + +template <> +__device__ inline double atomicExchCustom(double* address, double val) { + unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address); + return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val))); +} #ifdef EIGEN_HAS_CUDA_FP16 template <template <typename T> class R> __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) { -#if __CUDA_ARCH__ >= 300 unsigned int oldval = *reinterpret_cast<unsigned int*>(output); unsigned int newval = oldval; reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval)); @@ -87,9 +97,6 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer return; } } -#else - assert(0 && "Shouldn't be called on unsupported device"); -#endif } #endif @@ -112,18 +119,44 @@ __global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coe } } + template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index> __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, - typename Self::CoeffReturnType* output) { + typename Self::CoeffReturnType* output, unsigned int* semaphore) { +#if __CUDA_ARCH__ >= 300 + // Initialize the output value const Index first_index = blockIdx.x * BlockSize * NumPerThread + threadIdx.x; - - // Initialize the output value if it wasn't initialized by the ReductionInitKernel - if (gridDim.x == 1 && first_index == 0) { - *output = 
reducer.initialize(); - __syncthreads(); + if (gridDim.x == 1) { + if (first_index == 0) { + *output = reducer.initialize(); + } + } + else { + if (threadIdx.x == 0) { + unsigned int block = atomicCAS(semaphore, 0u, 1u); + if (block == 0) { + // We're the first block to run, initialize the output value + atomicExchCustom(output, reducer.initialize()); + __threadfence(); + atomicExch(semaphore, 2u); + } + else { + // Wait for the first block to initialize the output value. + // Use atomicCAS here to ensure that the reads aren't cached + unsigned int val; + do { + val = atomicCAS(semaphore, 2u, 2u); + } + while (val < 2u); + } + } } + __syncthreads(); + + eigen_assert(gridDim.x == 1 || *semaphore >= 2u); + typename Self::CoeffReturnType accum = reducer.initialize(); Index max_iter = numext::mini<Index>(num_coeffs - first_index, NumPerThread*BlockSize); for (Index i = 0; i < max_iter; i+=BlockSize) { @@ -141,6 +174,14 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num if ((threadIdx.x & (warpSize - 1)) == 0) { atomicReduce(output, accum, reducer); } + + if (gridDim.x > 1 && threadIdx.x == 0) { + // Let the last block reset the semaphore + atomicInc(semaphore, gridDim.x + 1); + } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif } @@ -229,32 +270,35 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2 #endif - -template <typename Self, typename Op, typename OutputType, bool PacketAccess> +template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> struct FullReductionLauncher { static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) { - assert(false && "Should only be called on floats and half floats"); + assert(false && "Should only be called on doubles, floats and half floats"); } }; -template <typename Self, typename Op, bool PacketAccess> -struct FullReductionLauncher<Self, Op, float, PacketAccess> { - static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs) { +// Specialization for float and double +template <typename Self, typename Op, typename OutputType, bool PacketAccess> +struct FullReductionLauncher< + Self, Op, OutputType, PacketAccess, + typename internal::enable_if< + internal::is_same<float, OutputType>::value || + internal::is_same<double, OutputType>::value, + void>::type> { + static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) { typedef typename Self::Index Index; typedef typename Self::CoeffReturnType Scalar; const int block_size = 256; const int num_per_thread = 128; const int num_blocks = divup<int>(num_coeffs, block_size * num_per_thread); + unsigned int* semaphore = NULL; if (num_blocks > 1) { - // We initialize the outputs outside the reduction kernel when we can't be sure that there - // won't be a race conditions between multiple thread blocks. 
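The initialization handshake added to FullReductionKernel above distills to the following sketch, assuming *semaphore starts at 0 (device.semaphore() is expected to hand out a zeroed word) and gridDim.x > 1; the real kernel additionally goes through atomicExchCustom so the publishing store also works for double:

    __device__ void init_output_once(float* output, float init_val,
                                     unsigned int* semaphore) {
      if (threadIdx.x == 0) {
        if (atomicCAS(semaphore, 0u, 1u) == 0u) {
          // First block to arrive: initialize, then publish.
          *output = init_val;
          __threadfence();            // make the store visible device-wide
          atomicExch(semaphore, 2u);  // 2 means "output initialized"
        } else {
          // Losing blocks spin; atomicCAS is used so the read is never cached.
          while (atomicCAS(semaphore, 2u, 2u) < 2u) {}
        }
      }
      __syncthreads();
    }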
- LAUNCH_CUDA_KERNEL((ReductionInitKernel<Scalar, Index>), - 1, 32, 0, device, reducer.initialize(), 1, output); + semaphore = device.semaphore(); } LAUNCH_CUDA_KERNEL((FullReductionKernel<block_size, num_per_thread, Self, Op, Index>), - num_blocks, block_size, 0, device, reducer, self, num_coeffs, output); + num_blocks, block_size, 0, device, reducer, self, num_coeffs, output, semaphore); } }; @@ -298,27 +342,29 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> { template <typename Self, typename Op, bool Vectorizable> struct FullReducer<Self, Op, GpuDevice, Vectorizable> { // Unfortunately nvidia doesn't support well exotic types such as complex, - // so reduce the scope of the optimized version of the code to the simple case - // of floats and half floats. - #ifdef EIGEN_HAS_CUDA_FP16 + // so reduce the scope of the optimized version of the code to the simple cases + // of doubles, floats and half floats +#ifdef EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value || - (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess)); + internal::is_same<typename Self::CoeffReturnType, double>::value || + (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); #else static const bool HasOptimizedImplementation = !Op::IsStateful && - internal::is_same<typename Self::CoeffReturnType, float>::value; + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value); #endif template <typename OutputType> static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) { - assert(HasOptimizedImplementation && "Should only be called on floats or half floats"); + assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); const Index num_coeffs = array_prod(self.m_impl.dimensions()); // Don't crash when we're called with an input tensor of size 0. 
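The launcher specializations above use enable_if to share one optimized implementation between float and double. Stripped to its essentials (with std::enable_if standing in for internal::enable_if), the dispatch pattern is:

    #include <type_traits>

    template <typename T, typename Enabled = void>
    struct Launcher {  // generic fallback, mirrors the assert-only default
      static bool run() { return false; }
    };

    // Selected whenever T is float or double; one body serves both types.
    template <typename T>
    struct Launcher<T, typename std::enable_if<
                           std::is_same<T, float>::value ||
                           std::is_same<T, double>::value>::type> {
      static bool run() { return true; }
    };

    // Launcher<float>::run() and Launcher<double>::run() pick the
    // specialization; Launcher<int>::run() falls back to the default.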
if (num_coeffs == 0) { return; } - FullReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs); + FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs); } }; @@ -327,6 +373,8 @@ template <int NumPerThread, typename Self, typename Reducer, typename Index> __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, typename Self::CoeffReturnType* output) { +#if __CUDA_ARCH__ >= 300 + typedef typename Self::CoeffReturnType Type; eigen_assert(blockDim.y == 1); eigen_assert(blockDim.z == 1); eigen_assert(gridDim.y == 1); @@ -356,13 +404,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu const Index col_block = i % input_col_blocks; const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x; - float reduced_val = reducer.initialize(); + Type reduced_val = reducer.initialize(); for (Index j = 0; j < NumPerThread; j += unroll_times) { const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1); if (last_col >= num_coeffs_to_reduce) { for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) { - const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); + const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col); reducer.reduce(val, &reduced_val); } break; @@ -386,6 +434,9 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu } } } +#else + assert(0 && "Shouldn't be called on unsupported device"); +#endif } #ifdef EIGEN_HAS_CUDA_FP16 @@ -485,17 +536,23 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, #endif -template <typename Self, typename Op, typename OutputType, bool PacketAccess> +template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void> struct InnerReductionLauncher { static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) { - assert(false && "Should only be called to reduce floats and half floats on a gpu device"); + assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device"); return true; } }; -template <typename Self, typename Op, bool PacketAccess> -struct InnerReductionLauncher<Self, Op, float, PacketAccess> { - static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { +// Specialization for float and double +template <typename Self, typename Op, typename OutputType, bool PacketAccess> +struct InnerReductionLauncher< + Self, Op, OutputType, PacketAccess, + typename internal::enable_if< + internal::is_same<float, OutputType>::value || + internal::is_same<double, OutputType>::value, + void>::type> { + static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { typedef typename Self::Index Index; const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals; @@ -513,7 +570,7 @@ struct InnerReductionLauncher<Self, Op, float, PacketAccess> { const int max_blocks = device.getNumCudaMultiProcessors() * device.maxCudaThreadsPerMultiProcessor() / 1024; const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks); - 
LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>), + LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>), num_blocks, 1024, 0, device, reducer.initialize(), num_preserved_vals, output); } @@ -580,15 +637,17 @@ struct InnerReducer<Self, Op, GpuDevice> { #ifdef EIGEN_HAS_CUDA_FP16 static const bool HasOptimizedImplementation = !Op::IsStateful && (internal::is_same<typename Self::CoeffReturnType, float>::value || - (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess)); + internal::is_same<typename Self::CoeffReturnType, double>::value || + (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess)); #else static const bool HasOptimizedImplementation = !Op::IsStateful && - internal::is_same<typename Self::CoeffReturnType, float>::value; + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value); #endif template <typename OutputType> static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) { - assert(HasOptimizedImplementation && "Should only be called on floats or half floats"); + assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats"); const Index num_coeffs = array_prod(self.m_impl.dimensions()); // Don't crash when we're called with an input tensor of size 0. if (num_coeffs == 0) { @@ -599,7 +658,7 @@ struct InnerReducer<Self, Op, GpuDevice> { return true; } - return InnerReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); + return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals); } }; @@ -639,11 +698,11 @@ struct OuterReducer<Self, Op, GpuDevice> { // so reduce the scope of the optimized version of the code to the simple case // of floats. static const bool HasOptimizedImplementation = !Op::IsStateful && - internal::is_same<typename Self::CoeffReturnType, float>::value; - + (internal::is_same<typename Self::CoeffReturnType, float>::value || + internal::is_same<typename Self::CoeffReturnType, double>::value); template <typename Device, typename OutputType> static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) { - assert(false && "Should only be called to reduce floats on a gpu device"); + assert(false && "Should only be called to reduce doubles or floats on a gpu device"); return true; } diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index 031dbf6f2..8501466ce 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -9,9 +9,11 @@ #ifndef EIGEN_CXX11_TENSOR_TENSOR_SCAN_H #define EIGEN_CXX11_TENSOR_TENSOR_SCAN_H + namespace Eigen { namespace internal { + template <typename Op, typename XprType> struct traits<TensorScanOp<Op, XprType> > : public traits<XprType> { @@ -42,9 +44,7 @@ struct nested<TensorScanOp<Op, XprType>, 1, * \ingroup CXX11_Tensor_Module * * \brief Tensor scan class. 
- * */ - template <typename Op, typename XprType> class TensorScanOp : public TensorBase<TensorScanOp<Op, XprType>, ReadOnlyAccessors> { @@ -57,8 +57,8 @@ public: typedef typename Eigen::internal::traits<TensorScanOp>::Index Index; EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorScanOp( - const XprType& expr, const Index& axis, const Op& op = Op()) - : m_expr(expr), m_axis(axis), m_accumulator(op) {} + const XprType& expr, const Index& axis, bool exclusive = false, const Op& op = Op()) + : m_expr(expr), m_axis(axis), m_accumulator(op), m_exclusive(exclusive) {} EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index axis() const { return m_axis; } @@ -66,13 +66,19 @@ public: const XprType& expression() const { return m_expr; } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op accumulator() const { return m_accumulator; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE + bool exclusive() const { return m_exclusive; } protected: typename XprType::Nested m_expr; const Index m_axis; const Op m_accumulator; + const bool m_exclusive; }; +template <typename Self, typename Reducer, typename Device> +struct ScanLauncher; + // Eval as rvalue template <typename Op, typename ArgType, typename Device> struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { @@ -81,13 +87,14 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { typedef typename XprType::Index Index; static const int NumDims = internal::array_size<typename TensorEvaluator<ArgType, Device>::Dimensions>::value; typedef DSizes<Index, NumDims> Dimensions; - typedef typename XprType::Scalar Scalar; + typedef typename internal::remove_const<typename XprType::Scalar>::type Scalar; typedef typename XprType::CoeffReturnType CoeffReturnType; typedef typename PacketType<CoeffReturnType, Device>::type PacketReturnType; + typedef TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> Self; enum { IsAligned = false, - PacketAccess = (internal::packet_traits<Scalar>::size > 1), + PacketAccess = (internal::unpacket_traits<PacketReturnType>::size > 1), BlockAccess = false, Layout = TensorEvaluator<ArgType, Device>::Layout, CoordAccess = false, @@ -98,45 +105,71 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { const Device& device) : m_impl(op.expression(), device), m_device(device), - m_axis(op.axis()), + m_exclusive(op.exclusive()), m_accumulator(op.accumulator()), - m_dimensions(m_impl.dimensions()), - m_size(m_dimensions[m_axis]), + m_size(m_impl.dimensions()[op.axis()]), m_stride(1), m_output(NULL) { // Accumulating a scalar isn't supported. 
- EIGEN_STATIC_ASSERT(NumDims > 0, YOU_MADE_A_PROGRAMMING_MISTAKE); - eigen_assert(m_axis >= 0 && m_axis < NumDims); + EIGEN_STATIC_ASSERT((NumDims > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + eigen_assert(op.axis() >= 0 && op.axis() < NumDims); // Compute stride of scan axis + const Dimensions& dims = m_impl.dimensions(); if (static_cast<int>(Layout) == static_cast<int>(ColMajor)) { - for (int i = 0; i < m_axis; ++i) { - m_stride = m_stride * m_dimensions[i]; + for (int i = 0; i < op.axis(); ++i) { + m_stride = m_stride * dims[i]; } } else { - for (int i = NumDims - 1; i > m_axis; --i) { - m_stride = m_stride * m_dimensions[i]; + for (int i = NumDims - 1; i > op.axis(); --i) { + m_stride = m_stride * dims[i]; } } } EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const { - return m_dimensions; + return m_impl.dimensions(); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& stride() const { + return m_stride; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Index& size() const { + return m_size; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Op& accumulator() const { + return m_accumulator; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool exclusive() const { + return m_exclusive; } - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const TensorEvaluator<ArgType, Device>& inner() const { + return m_impl; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Device& device() const { + return m_device; + } + + EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) { m_impl.evalSubExprsIfNeeded(NULL); + ScanLauncher<Self, Op, Device> launcher; if (data) { - accumulateTo(data); + launcher(*this, data); return false; - } else { - m_output = static_cast<CoeffReturnType*>(m_device.allocate(dimensions().TotalSize() * sizeof(Scalar))); - accumulateTo(m_output); - return true; } + + const Index total_size = internal::array_prod(dimensions()); + m_output = static_cast<CoeffReturnType*>(m_device.allocate(total_size * sizeof(Scalar))); + launcher(*this, m_output); + return true; } - + template<int LoadMode> EIGEN_DEVICE_FUNC PacketReturnType packet(Index index) const { return internal::ploadt<PacketReturnType, LoadMode>(m_output + index); @@ -152,6 +185,10 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { return m_output[index]; } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost costPerCoeff(bool) const { + return TensorOpCost(sizeof(CoeffReturnType), 0, 0); + } + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void cleanup() { if (m_output != NULL) { m_device.deallocate(m_output); @@ -163,35 +200,88 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> { protected: TensorEvaluator<ArgType, Device> m_impl; const Device& m_device; - const Index m_axis; + const bool m_exclusive; Op m_accumulator; - const Dimensions& m_dimensions; - const Index& m_size; + const Index m_size; Index m_stride; CoeffReturnType* m_output; +}; + +// CPU implementation of scan +// TODO(ibab) This single-threaded implementation should be parallelized, +// at least by running multiple scans at the same time. 
+template <typename Self, typename Reducer, typename Device> +struct ScanLauncher { + void operator()(Self& self, typename Self::CoeffReturnType *data) { + Index total_size = internal::array_prod(self.dimensions()); - // TODO(ibab) Parallelize this single-threaded implementation if desired - EIGEN_DEVICE_FUNC void accumulateTo(Scalar* data) { - // We fix the index along the scan axis to 0 and perform an + // We fix the index along the scan axis to 0 and perform a // scan per remaining entry. The iteration is split into two nested // loops to avoid an integer division by keeping track of each idx1 and idx2. - for (Index idx1 = 0; idx1 < dimensions().TotalSize() / m_size; idx1 += m_stride) { - for (Index idx2 = 0; idx2 < m_stride; idx2++) { - // Calculate the starting offset for the scan - Index offset = idx1 * m_size + idx2; - - // Compute the prefix sum along the axis, starting at the calculated offset - CoeffReturnType accum = m_accumulator.initialize(); - for (Index idx3 = 0; idx3 < m_size; idx3++) { - Index curr = offset + idx3 * m_stride; - m_accumulator.reduce(m_impl.coeff(curr), &accum); - data[curr] = m_accumulator.finalize(accum); + for (Index idx1 = 0; idx1 < total_size; idx1 += self.stride() * self.size()) { + for (Index idx2 = 0; idx2 < self.stride(); idx2++) { + // Calculate the starting offset for the scan + Index offset = idx1 + idx2; + + // Compute the scan along the axis, starting at the calculated offset + typename Self::CoeffReturnType accum = self.accumulator().initialize(); + for (Index idx3 = 0; idx3 < self.size(); idx3++) { + Index curr = offset + idx3 * self.stride(); + + if (self.exclusive()) { + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } else { + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); } - } + } + } } } }; +#if defined(EIGEN_USE_GPU) && defined(__CUDACC__) + +// GPU implementation of scan +// TODO(ibab) This placeholder implementation performs multiple scans in +// parallel, but it would be better to use a parallel scan algorithm and +// optimize memory access. 
+template <typename Self, typename Reducer> +__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) { + // Compute offset as in the CPU version + Index val = threadIdx.x + blockIdx.x * blockDim.x; + Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride(); + + if (offset + (self.size() - 1) * self.stride() < total_size) { + // Compute the scan along the axis, starting at the calculated offset + typename Self::CoeffReturnType accum = self.accumulator().initialize(); + for (Index idx = 0; idx < self.size(); idx++) { + Index curr = offset + idx * self.stride(); + if (self.exclusive()) { + data[curr] = self.accumulator().finalize(accum); + self.accumulator().reduce(self.inner().coeff(curr), &accum); + } else { + self.accumulator().reduce(self.inner().coeff(curr), &accum); + data[curr] = self.accumulator().finalize(accum); + } + } + } + __syncthreads(); + +} + +template <typename Self, typename Reducer> +struct ScanLauncher<Self, Reducer, GpuDevice> { + void operator()(const Self& self, typename Self::CoeffReturnType* data) { + Index total_size = internal::array_prod(self.dimensions()); + Index num_blocks = (total_size / self.size() + 63) / 64; + Index block_size = 64; + LAUNCH_CUDA_KERNEL((ScanKernel<Self, Reducer>), num_blocks, block_size, 0, self.device(), self, total_size, data); + } +}; +#endif // EIGEN_USE_GPU && __CUDACC__ + } // end namespace Eigen #endif // EIGEN_CXX11_TENSOR_TENSOR_SCAN_H diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h index bdcd70fd9..3523e7c94 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorUInt128.h @@ -20,6 +20,7 @@ struct static_val { EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE operator uint64_t() const { return n; } EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val() { } + template <typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE static_val(const T& v) { eigen_assert(v == n); @@ -53,7 +54,7 @@ struct TensorUInt128 template<typename T> EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE explicit TensorUInt128(const T& x) : high(0), low(x) { - eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= static_cast<typename conditional<sizeof(LOW) == 8, uint64_t, uint32_t>::type>(NumTraits<LOW>::highest()))); + eigen_assert((static_cast<typename conditional<sizeof(T) == 8, uint64_t, uint32_t>::type>(x) <= NumTraits<uint64_t>::highest())); eigen_assert(x >= 0); } diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt deleted file mode 100644 index 6e871a8da..000000000 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -FILE(GLOB Eigen_CXX11_TensorSymmetry_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_TensorSymmetry_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry COMPONENT Devel - ) - -add_subdirectory(util) diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt deleted file mode 100644 index dc9fc78ec..000000000 --- a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_CXX11_TensorSymmetry_util_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_TensorSymmetry_util_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry/util 
COMPONENT Devel - ) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt deleted file mode 100644 index 88fef50c6..000000000 --- a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_CXX11_ThreadPool_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_ThreadPool_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/ThreadPool COMPONENT Devel - ) diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h index 6dd64f185..12b80d6c4 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h @@ -169,7 +169,8 @@ class EventCount { class Waiter { friend class EventCount; - std::atomic<Waiter*> next; + // Align to 128 byte boundary to prevent false sharing with other Waiter objects in the same vector. + EIGEN_ALIGN_TO_BOUNDARY(128) std::atomic<Waiter*> next; std::mutex mu; std::condition_variable cv; uint64_t epoch; @@ -179,8 +180,6 @@ class EventCount { kWaiting, kSignaled, }; - // Prevent false sharing with other Waiter objects in the same vector. - char pad_[128]; }; private: diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h index c094563b7..33ae45131 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h @@ -74,7 +74,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { PerThread* pt = GetPerThread(); if (pt->pool == this) { // Worker thread of this pool, push onto the thread's queue. - Queue* q = queues_[pt->index]; + Queue* q = queues_[pt->thread_id]; t = q->PushFront(std::move(t)); } else { // A free-standing thread (or worker of another pool), push onto a random @@ -95,14 +95,28 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { env_.ExecuteTask(t); // Push failed, execute directly. } + int NumThreads() const final { + return static_cast<int>(threads_.size()); + } + + int CurrentThreadId() const final { + const PerThread* pt = + const_cast<NonBlockingThreadPoolTempl*>(this)->GetPerThread(); + if (pt->pool == this) { + return pt->thread_id; + } else { + return -1; + } + } + private: typedef typename Environment::EnvThread Thread; struct PerThread { - bool inited; + constexpr PerThread() : pool(NULL), rand(0), thread_id(-1) { } NonBlockingThreadPoolTempl* pool; // Parent pool, or null for normal threads. - unsigned index; // Worker thread index in pool. - unsigned rand; // Random generator state. + uint64_t rand; // Random generator state. + int thread_id; // Worker thread index in pool. }; Environment env_; @@ -116,12 +130,13 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { EventCount ec_; // Main worker thread loop. 
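The Waiter change in EventCount.h above trades explicit tail padding for alignment on the first member; a minimal sketch of the two layouts (member names are illustrative, 128 bytes matches the patch):

    #include <atomic>

    struct PaddedWaiter {            // old approach: explicit tail padding
      std::atomic<int> state;
      char pad_[128];
    };
    struct AlignedWaiter {           // new approach: over-align a member
      alignas(128) std::atomic<int> state;
    };
    // alignas(128) raises the struct's alignment, so sizeof(AlignedWaiter)
    // becomes a multiple of 128 and neighbouring Waiters in a vector never
    // share a cache line.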
- void WorkerLoop(unsigned index) { + void WorkerLoop(int thread_id) { PerThread* pt = GetPerThread(); pt->pool = this; - pt->index = index; - Queue* q = queues_[index]; - EventCount::Waiter* waiter = &waiters_[index]; + pt->rand = std::hash<std::thread::id>()(std::this_thread::get_id()); + pt->thread_id = thread_id; + Queue* q = queues_[thread_id]; + EventCount::Waiter* waiter = &waiters_[thread_id]; for (;;) { Task t = q->PopFront(); if (!t.f) { @@ -235,17 +250,18 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface { return -1; } - PerThread* GetPerThread() { + static EIGEN_STRONG_INLINE PerThread* GetPerThread() { EIGEN_THREAD_LOCAL PerThread per_thread_; PerThread* pt = &per_thread_; - if (pt->inited) return pt; - pt->inited = true; - pt->rand = static_cast<unsigned>(std::hash<std::thread::id>()(std::this_thread::get_id())); return pt; } - static unsigned Rand(unsigned* state) { - return *state = *state * 1103515245 + 12345; + static EIGEN_STRONG_INLINE unsigned Rand(uint64_t* state) { + uint64_t current = *state; + // Update the internal state + *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL; + // Generate the random output (using the PCG-XSH-RS scheme) + return static_cast<unsigned>((current ^ (current >> 22)) >> (22 + (current >> 61))); } }; diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h index 17fd1658b..e75d0f467 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/SimpleThreadPool.h @@ -24,7 +24,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { explicit SimpleThreadPoolTempl(int num_threads, Environment env = Environment()) : env_(env), threads_(num_threads), waiters_(num_threads) { for (int i = 0; i < num_threads; i++) { - threads_.push_back(env.CreateThread([this]() { WorkerLoop(); })); + threads_.push_back(env.CreateThread([this, i]() { WorkerLoop(i); })); } } @@ -55,7 +55,7 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { // Schedule fn() for execution in the pool of threads. The functions are // executed in the order in which they are scheduled. - void Schedule(std::function<void()> fn) { + void Schedule(std::function<void()> fn) final { Task t = env_.CreateTask(std::move(fn)); std::unique_lock<std::mutex> l(mu_); if (waiters_.empty()) { @@ -69,9 +69,25 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { } } + int NumThreads() const final { + return static_cast<int>(threads_.size()); + } + + int CurrentThreadId() const final { + const PerThread* pt = this->GetPerThread(); + if (pt->pool == this) { + return pt->thread_id; + } else { + return -1; + } + } + protected: - void WorkerLoop() { + void WorkerLoop(int thread_id) { std::unique_lock<std::mutex> l(mu_); + PerThread* pt = GetPerThread(); + pt->pool = this; + pt->thread_id = thread_id; Waiter w; Task t; while (!exiting_) { @@ -111,13 +127,24 @@ class SimpleThreadPoolTempl : public ThreadPoolInterface { bool ready; }; + struct PerThread { + constexpr PerThread() : pool(NULL), thread_id(-1) { } + SimpleThreadPoolTempl* pool; // Parent pool, or null for normal threads. + int thread_id; // Worker thread index in pool. + }; + Environment env_; std::mutex mu_; MaxSizeVector<Thread*> threads_; // All threads MaxSizeVector<Waiter*> waiters_; // Stack of waiting threads. 
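The new Rand above is a PCG-XSH-RS generator: a 64-bit LCG state update followed by an xorshift output permutation with a data-dependent shift amount. Copied out as a standalone function for illustration only:

    #include <cstdint>

    inline unsigned pcg_xsh_rs(uint64_t* state) {
      const uint64_t current = *state;
      // LCG state update.
      *state = current * 6364136223846793005ULL + 0xda3e39cb94b95bdbULL;
      // XSH-RS output: xorshift, then a random shift of 22..29 bits.
      return static_cast<unsigned>(
          (current ^ (current >> 22)) >> (22 + (current >> 61)));
    }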
- std::deque<Task> pending_; // Queue of pending work - std::condition_variable empty_; // Signaled on pending_.empty() + std::deque<Task> pending_; // Queue of pending work + std::condition_variable empty_; // Signaled on pending_.empty() bool exiting_ = false; + + PerThread* GetPerThread() const { + EIGEN_THREAD_LOCAL PerThread per_thread; + return &per_thread; + } }; typedef SimpleThreadPoolTempl<StlThreadEnvironment> SimpleThreadPool; diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h index d2204ad5b..399f95cc1 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadEnvironment.h @@ -21,14 +21,14 @@ struct StlThreadEnvironment { // destructor must join the thread. class EnvThread { public: - EnvThread(std::function<void()> f) : thr_(f) {} + EnvThread(std::function<void()> f) : thr_(std::move(f)) {} ~EnvThread() { thr_.join(); } private: std::thread thr_; }; - EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(f); } + EnvThread* CreateThread(std::function<void()> f) { return new EnvThread(std::move(f)); } Task CreateTask(std::function<void()> f) { return Task{std::move(f)}; } void ExecuteTask(const Task& t) { t.f(); } }; diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h index 38b40aceb..a65ee97c9 100644 --- a/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h +++ b/unsupported/Eigen/CXX11/src/ThreadPool/ThreadPoolInterface.h @@ -18,6 +18,13 @@ class ThreadPoolInterface { public: virtual void Schedule(std::function<void()> fn) = 0; + // Returns the number of threads in the pool. + virtual int NumThreads() const = 0; + + // Returns a logical thread index between 0 and NumThreads() - 1 if called + // from one of the threads in the pool. Returns -1 otherwise. 
+ virtual int CurrentThreadId() const = 0; + virtual ~ThreadPoolInterface() {} }; diff --git a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt deleted file mode 100644 index 7eab492d6..000000000 --- a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_CXX11_util_SRCS "*.h") - -INSTALL(FILES - ${Eigen_CXX11_util_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/util COMPONENT Devel - ) diff --git a/unsupported/Eigen/KroneckerProduct b/unsupported/Eigen/KroneckerProduct index c932c06a6..5f5afb8cf 100644 --- a/unsupported/Eigen/KroneckerProduct +++ b/unsupported/Eigen/KroneckerProduct @@ -13,6 +13,8 @@ #include "../../Eigen/src/Core/util/DisableStupidWarnings.h" +#include "../../Eigen/src/SparseCore/SparseUtil.h" + namespace Eigen { /** diff --git a/unsupported/Eigen/MPRealSupport b/unsupported/Eigen/MPRealSupport index 89036886b..7f0b70c63 100644 --- a/unsupported/Eigen/MPRealSupport +++ b/unsupported/Eigen/MPRealSupport @@ -67,27 +67,32 @@ int main() IsSigned = 1, IsComplex = 0, RequireInitialization = 1, - ReadCost = 10, - AddCost = 10, - MulCost = 40 + ReadCost = HugeCost, + AddCost = HugeCost, + MulCost = HugeCost }; typedef mpfr::mpreal Real; typedef mpfr::mpreal NonInteger; - inline static Real highest (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(Precision); } - inline static Real lowest (long Precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(Precision); } + static inline Real highest (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::maxval(Precision); } + static inline Real lowest (long Precision = mpfr::mpreal::get_default_prec()) { return -mpfr::maxval(Precision); } // Constants - inline static Real Pi (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_pi(Precision); } - inline static Real Euler (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_euler(Precision); } - inline static Real Log2 (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_log2(Precision); } - inline static Real Catalan (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_catalan(Precision); } + static inline Real Pi (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_pi(Precision); } + static inline Real Euler (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_euler(Precision); } + static inline Real Log2 (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_log2(Precision); } + static inline Real Catalan (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::const_catalan(Precision); } - inline static Real epsilon (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(Precision); } - inline static Real epsilon (const Real& x) { return mpfr::machine_epsilon(x); } + static inline Real epsilon (long Precision = mpfr::mpreal::get_default_prec()) { return mpfr::machine_epsilon(Precision); } + static inline Real epsilon (const Real& x) { return mpfr::machine_epsilon(x); } - inline static Real dummy_precision() +#ifdef MPREAL_HAVE_DYNAMIC_STD_NUMERIC_LIMITS + static inline int digits10 (long Precision = mpfr::mpreal::get_default_prec()) { return std::numeric_limits<Real>::digits10(Precision); } + static inline int digits10 (const Real& x) { return std::numeric_limits<Real>::digits10(x); } +#endif + + static inline Real dummy_precision() { mpfr_prec_t 
weak_prec = ((mpfr::mpreal::get_default_prec()-1) * 90) / 100; return mpfr::machine_epsilon(weak_prec); diff --git a/unsupported/Eigen/SpecialFunctions b/unsupported/Eigen/SpecialFunctions new file mode 100644 index 000000000..7c7493c56 --- /dev/null +++ b/unsupported/Eigen/SpecialFunctions @@ -0,0 +1,61 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud <g.gael@free.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPECIALFUNCTIONS_MODULE +#define EIGEN_SPECIALFUNCTIONS_MODULE + +#include "../../Eigen/Core" + +#include "../../Eigen/src/Core/util/DisableStupidWarnings.h" + +namespace Eigen { + +/** + * \defgroup SpecialFunctions_Module Special math functions module + * + * This module features additional coefficient-wise math functions available + * within the numext:: namespace for the scalar version, and as method and/or free + * functions of Array. Those include: + * + * - erf + * - erfc + * - lgamma + * - igamma + * - igammac + * - digamma + * - polygamma + * - zeta + * - betainc + * + * \code + * #include <unsupported/Eigen/SpecialFunctions> + * \endcode + */ +//@{ + +} + +#include "src/SpecialFunctions/SpecialFunctionsImpl.h" +#include "src/SpecialFunctions/SpecialFunctionsPacketMath.h" +#include "src/SpecialFunctions/SpecialFunctionsHalf.h" +#include "src/SpecialFunctions/SpecialFunctionsFunctors.h" +#include "src/SpecialFunctions/SpecialFunctionsArrayAPI.h" + +#if defined EIGEN_VECTORIZE_CUDA + #include "src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h" +#endif + +namespace Eigen { +//@} +} + + +#include "../../Eigen/src/Core/util/ReenableStupidWarnings.h" + +#endif // EIGEN_SPECIALFUNCTIONS_MODULE diff --git a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h index 089042751..50fedf6ac 100755 --- a/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h +++ b/unsupported/Eigen/src/AutoDiff/AutoDiffScalar.h @@ -30,6 +30,13 @@ template<typename _DerType, bool Enable> struct auto_diff_special_op; } // end namespace internal +template<typename _DerType> class AutoDiffScalar; + +template<typename NewDerType> +inline AutoDiffScalar<NewDerType> MakeAutoDiffScalar(const typename NewDerType::Scalar& value, const NewDerType &der) { + return AutoDiffScalar<NewDerType>(value,der); +} + /** \class AutoDiffScalar * \brief A scalar type replacement with automatic differentation capability * @@ -60,7 +67,7 @@ template<typename _DerType> class AutoDiffScalar : public internal::auto_diff_special_op <_DerType, !internal::is_same<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar, - typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value> + typename NumTraits<typename internal::traits<typename internal::remove_all<_DerType>::type>::Scalar>::Real>::value> { public: typedef internal::auto_diff_special_op @@ -257,20 +264,16 @@ class AutoDiffScalar -m_derivatives); } - inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> > + inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) > operator*(const Scalar& other) const { - return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >( - m_value * other, - 
(m_derivatives * other)); + return MakeAutoDiffScalar(m_value * other, m_derivatives * other); } - friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> > + friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) > operator*(const Scalar& other, const AutoDiffScalar& a) { - return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >( - a.value() * other, - a.derivatives() * other); + return MakeAutoDiffScalar(a.value() * other, a.derivatives() * other); } // inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type > @@ -289,20 +292,16 @@ class AutoDiffScalar // a.derivatives() * other); // } - inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> > + inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) > operator/(const Scalar& other) const { - return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >( - m_value / other, - (m_derivatives * (Scalar(1)/other))); + return MakeAutoDiffScalar(m_value / other, (m_derivatives * (Scalar(1)/other))); } - friend inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> > + friend inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) > operator/(const Scalar& other, const AutoDiffScalar& a) { - return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType> >( - other / a.value(), - a.derivatives() * (Scalar(-other) / (a.value()*a.value()))); + return MakeAutoDiffScalar(other / a.value(), a.derivatives() * (Scalar(-other) / (a.value()*a.value()))); } // inline const AutoDiffScalar<typename CwiseUnaryOp<internal::scalar_multiple_op<Real>, DerType>::Type > @@ -322,34 +321,29 @@ class AutoDiffScalar // } template<typename OtherDerType> - inline const AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, - const CwiseBinaryOp<internal::scalar_difference_op<Scalar>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > > + inline const AutoDiffScalar<EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE( + CwiseBinaryOp<internal::scalar_difference_op<Scalar> EIGEN_COMMA + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product) EIGEN_COMMA + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) >,Scalar,product) > operator/(const AutoDiffScalar<OtherDerType>& other) const { internal::make_coherent(m_derivatives, other.derivatives()); - return AutoDiffScalar<CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, - const CwiseBinaryOp<internal::scalar_difference_op<Scalar>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > > >( + return MakeAutoDiffScalar( m_value / other.value(), - ((m_derivatives * other.value()) - (m_value * other.derivatives())) + ((m_derivatives * other.value()) - (other.derivatives() * m_value)) * (Scalar(1)/(other.value()*other.value()))); } template<typename OtherDerType> inline const AutoDiffScalar<CwiseBinaryOp<internal::scalar_sum_op<Scalar>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>, - const 
CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type> > > + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(DerType,Scalar,product), + const EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<OtherDerType>::type,Scalar,product) > > operator*(const AutoDiffScalar<OtherDerType>& other) const { internal::make_coherent(m_derivatives, other.derivatives()); - return AutoDiffScalar<const CwiseBinaryOp<internal::scalar_sum_op<Scalar>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const DerType>, - const CwiseUnaryOp<internal::scalar_multiple_op<Scalar>, const typename internal::remove_all<OtherDerType>::type > > >( + return MakeAutoDiffScalar( m_value * other.value(), - (m_derivatives * other.value()) + (m_value * other.derivatives())); + (m_derivatives * other.value()) + (other.derivatives() * m_value)); } inline AutoDiffScalar& operator*=(const Scalar& other) @@ -426,18 +420,18 @@ struct auto_diff_special_op<_DerType, true> } - inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type > + inline const AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type > operator*(const Real& other) const { - return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >( + return AutoDiffScalar<typename CwiseUnaryOp<bind2nd_op<scalar_product_op<Scalar,Real> >, DerType>::Type >( derived().value() * other, derived().derivatives() * other); } - friend inline const AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type > + friend inline const AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type > operator*(const Real& other, const AutoDiffScalar<_DerType>& a) { - return AutoDiffScalar<typename CwiseUnaryOp<scalar_multiple2_op<Scalar,Real>, DerType>::Type >( + return AutoDiffScalar<typename CwiseUnaryOp<bind1st_op<scalar_product_op<Real,Scalar> >, DerType>::Type >( a.value() * other, a.derivatives() * other); } @@ -501,43 +495,44 @@ struct make_coherent_impl<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, } }; -template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols> -struct scalar_product_traits<Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols>,A_Scalar> -{ - enum { Defined = 1 }; - typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType; -}; - -template<typename A_Scalar, int A_Rows, int A_Cols, int A_Options, int A_MaxRows, int A_MaxCols> -struct scalar_product_traits<A_Scalar, Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> > -{ - enum { Defined = 1 }; - typedef Matrix<A_Scalar, A_Rows, A_Cols, A_Options, A_MaxRows, A_MaxCols> ReturnType; -}; +} // end namespace internal -template<typename DerType> -struct scalar_product_traits<AutoDiffScalar<DerType>,typename DerType::Scalar> +template<typename DerType, typename BinOp> +struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,typename DerType::Scalar,BinOp> { - enum { Defined = 1 }; typedef AutoDiffScalar<DerType> ReturnType; }; -template<typename DerType> -struct scalar_product_traits<typename DerType::Scalar,AutoDiffScalar<DerType> > +template<typename DerType, typename BinOp> +struct ScalarBinaryOpTraits<typename DerType::Scalar,AutoDiffScalar<DerType>, BinOp> { - enum { Defined = 1 }; typedef AutoDiffScalar<DerType> ReturnType; }; -} // end namespace internal + +// The following is an 
attempt to let Eigen know about expression templates, but that's more tricky! + +// template<typename DerType, typename BinOp> +// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType>,AutoDiffScalar<DerType>, BinOp> +// { +// enum { Defined = 1 }; +// typedef AutoDiffScalar<typename DerType::PlainObject> ReturnType; +// }; +// +// template<typename DerType1,typename DerType2, typename BinOp> +// struct ScalarBinaryOpTraits<AutoDiffScalar<DerType1>,AutoDiffScalar<DerType2>, BinOp> +// { +// enum { Defined = 1 };//internal::is_same<typename DerType1::Scalar,typename DerType2::Scalar>::value }; +// typedef AutoDiffScalar<typename DerType1::PlainObject> ReturnType; +// }; #define EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(FUNC,CODE) \ template<typename DerType> \ - inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > \ + inline const Eigen::AutoDiffScalar< \ + EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename Eigen::internal::remove_all<DerType>::type, typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar, product) > \ FUNC(const Eigen::AutoDiffScalar<DerType>& x) { \ using namespace Eigen; \ - typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \ - typedef AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const typename Eigen::internal::remove_all<DerType>::type> > ReturnType; \ + EIGEN_UNUSED typedef typename Eigen::internal::traits<typename Eigen::internal::remove_all<DerType>::type>::Scalar Scalar; \ CODE; \ } @@ -567,49 +562,56 @@ inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::Plain typedef AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> ADS; return (x > y ? ADS(x) : ADS(y)); } +template<typename DerType> +inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (min)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) { + return (x.value() < y.value() ? x : y); +} +template<typename DerType> +inline AutoDiffScalar<typename Eigen::internal::remove_all<DerType>::type::PlainObject> (max)(const AutoDiffScalar<DerType>& x, const AutoDiffScalar<DerType>& y) { + return (x.value() >= y.value() ? x : y); +} + EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs, using std::abs; - return ReturnType(abs(x.value()), x.derivatives() * (x.value()<0 ?
-1 : 1) );) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(abs2, using numext::abs2; - return ReturnType(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));) + return Eigen::MakeAutoDiffScalar(abs2(x.value()), x.derivatives() * (Scalar(2)*x.value()));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sqrt, using std::sqrt; Scalar sqrtx = sqrt(x.value()); - return ReturnType(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));) + return Eigen::MakeAutoDiffScalar(sqrtx,x.derivatives() * (Scalar(0.5) / sqrtx));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cos, using std::cos; using std::sin; - return ReturnType(cos(x.value()), x.derivatives() * (-sin(x.value())));) + return Eigen::MakeAutoDiffScalar(cos(x.value()), x.derivatives() * (-sin(x.value())));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sin, using std::sin; using std::cos; - return ReturnType(sin(x.value()),x.derivatives() * cos(x.value()));) + return Eigen::MakeAutoDiffScalar(sin(x.value()),x.derivatives() * cos(x.value()));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(exp, using std::exp; Scalar expx = exp(x.value()); - return ReturnType(expx,x.derivatives() * expx);) + return Eigen::MakeAutoDiffScalar(expx,x.derivatives() * expx);) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(log, using std::log; - return ReturnType(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));) + return Eigen::MakeAutoDiffScalar(log(x.value()),x.derivatives() * (Scalar(1)/x.value()));) template<typename DerType> -inline const Eigen::AutoDiffScalar<Eigen::CwiseUnaryOp<Eigen::internal::scalar_multiple_op<typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar>, const typename internal::remove_all<DerType>::type> > -pow(const Eigen::AutoDiffScalar<DerType>& x, const typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar &y) +inline const Eigen::AutoDiffScalar< +EIGEN_EXPR_BINARYOP_SCALAR_RETURN_TYPE(typename internal::remove_all<DerType>::type,typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar,product) > +pow(const Eigen::AutoDiffScalar<DerType> &x, const typename internal::traits<typename internal::remove_all<DerType>::type>::Scalar &y) { using namespace Eigen; - typedef typename internal::remove_all<DerType>::type DerTypeCleaned; - typedef typename Eigen::internal::traits<DerTypeCleaned>::Scalar Scalar; - return AutoDiffScalar<CwiseUnaryOp<Eigen::internal::scalar_multiple_op<Scalar>, const DerTypeCleaned> >( - std::pow(x.value(),y), - x.derivatives() * (y * std::pow(x.value(),y-1))); + using std::pow; + return Eigen::MakeAutoDiffScalar(pow(x.value(),y), x.derivatives() * (y * pow(x.value(),y-1))); } @@ -634,27 +636,44 @@ atan2(const AutoDiffScalar<DerTypeA>& a, const AutoDiffScalar<DerTypeB>& b) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tan, using std::tan; using std::cos; - return ReturnType(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));) + return Eigen::MakeAutoDiffScalar(tan(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cos(x.value()))));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(asin, using std::sqrt; using std::asin; - return ReturnType(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));) + return Eigen::MakeAutoDiffScalar(asin(x.value()),x.derivatives() * (Scalar(1)/sqrt(1-numext::abs2(x.value()))));) EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(acos, using std::sqrt; using std::acos; - return ReturnType(acos(x.value()),x.derivatives() * (Scalar(-1)/sqrt(1-numext::abs2(x.value()))));) + return Eigen::MakeAutoDiffScalar(acos(x.value()),x.derivatives() * 
(Scalar(-1)/sqrt(1-numext::abs2(x.value()))));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(tanh, + using std::cosh; + using std::tanh; + return Eigen::MakeAutoDiffScalar(tanh(x.value()),x.derivatives() * (Scalar(1)/numext::abs2(cosh(x.value()))));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(sinh, + using std::sinh; + using std::cosh; + return Eigen::MakeAutoDiffScalar(sinh(x.value()),x.derivatives() * cosh(x.value()));) + +EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY(cosh, + using std::sinh; + using std::cosh; + return Eigen::MakeAutoDiffScalar(cosh(x.value()),x.derivatives() * sinh(x.value()));) #undef EIGEN_AUTODIFF_DECLARE_GLOBAL_UNARY template<typename DerType> struct NumTraits<AutoDiffScalar<DerType> > - : NumTraits< typename NumTraits<typename DerType::Scalar>::Real > + : NumTraits< typename NumTraits<typename internal::remove_all<DerType>::type::Scalar>::Real > { - typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerType::Scalar>::Real,DerType::RowsAtCompileTime,DerType::ColsAtCompileTime, - DerType::Options, DerType::MaxRowsAtCompileTime, DerType::MaxColsAtCompileTime> > Real; + typedef typename internal::remove_all<DerType>::type DerTypeCleaned; + typedef AutoDiffScalar<Matrix<typename NumTraits<typename DerTypeCleaned::Scalar>::Real,DerTypeCleaned::RowsAtCompileTime,DerTypeCleaned::ColsAtCompileTime, + 0, DerTypeCleaned::MaxRowsAtCompileTime, DerTypeCleaned::MaxColsAtCompileTime> > Real; typedef AutoDiffScalar<DerType> NonInteger; typedef AutoDiffScalar<DerType> Nested; + typedef typename NumTraits<typename DerTypeCleaned::Scalar>::Literal Literal; enum{ RequireInitialization = 1 }; diff --git a/unsupported/Eigen/src/AutoDiff/CMakeLists.txt b/unsupported/Eigen/src/AutoDiff/CMakeLists.txt deleted file mode 100644 index ad91fd9c4..000000000 --- a/unsupported/Eigen/src/AutoDiff/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_AutoDiff_SRCS "*.h") - -INSTALL(FILES - ${Eigen_AutoDiff_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/AutoDiff COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/BVH/CMakeLists.txt b/unsupported/Eigen/src/BVH/CMakeLists.txt deleted file mode 100644 index b377d865c..000000000 --- a/unsupported/Eigen/src/BVH/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_BVH_SRCS "*.h") - -INSTALL(FILES - ${Eigen_BVH_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/BVH COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/CMakeLists.txt b/unsupported/Eigen/src/CMakeLists.txt deleted file mode 100644 index 754953335..000000000 --- a/unsupported/Eigen/src/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -ADD_SUBDIRECTORY(AutoDiff) -ADD_SUBDIRECTORY(BVH) -ADD_SUBDIRECTORY(Eigenvalues) -ADD_SUBDIRECTORY(FFT) -ADD_SUBDIRECTORY(IterativeSolvers) -ADD_SUBDIRECTORY(LevenbergMarquardt) -ADD_SUBDIRECTORY(MatrixFunctions) -ADD_SUBDIRECTORY(MoreVectorization) -ADD_SUBDIRECTORY(NonLinearOptimization) -ADD_SUBDIRECTORY(NumericalDiff) -ADD_SUBDIRECTORY(Polynomials) -ADD_SUBDIRECTORY(Skyline) -ADD_SUBDIRECTORY(SparseExtra) -ADD_SUBDIRECTORY(KroneckerProduct) -ADD_SUBDIRECTORY(Splines) -ADD_SUBDIRECTORY(EulerAngles) diff --git a/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt b/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt deleted file mode 100644 index 1d4387c82..000000000 --- a/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Eigenvalues_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Eigenvalues_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Eigenvalues COMPONENT Devel - ) diff 
--git a/unsupported/Eigen/src/FFT/CMakeLists.txt b/unsupported/Eigen/src/FFT/CMakeLists.txt deleted file mode 100644 index edcffcb18..000000000 --- a/unsupported/Eigen/src/FFT/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_FFT_SRCS "*.h") - -INSTALL(FILES - ${Eigen_FFT_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/FFT COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt b/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt deleted file mode 100644 index 7986afc5e..000000000 --- a/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_IterativeSolvers_SRCS "*.h") - -INSTALL(FILES - ${Eigen_IterativeSolvers_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/IterativeSolvers COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt b/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt deleted file mode 100644 index 4daefebee..000000000 --- a/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_KroneckerProduct_SRCS "*.h") - -INSTALL(FILES - ${Eigen_KroneckerProduct_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/KroneckerProduct COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h index bf9727c21..582fa8512 100644 --- a/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h +++ b/unsupported/Eigen/src/KroneckerProduct/KroneckerTensorProduct.h @@ -203,7 +203,7 @@ struct traits<KroneckerProduct<_Lhs,_Rhs> > { typedef typename remove_all<_Lhs>::type Lhs; typedef typename remove_all<_Rhs>::type Rhs; - typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar; + typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar; typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex; enum { @@ -222,7 +222,7 @@ struct traits<KroneckerProductSparse<_Lhs,_Rhs> > typedef MatrixXpr XprKind; typedef typename remove_all<_Lhs>::type Lhs; typedef typename remove_all<_Rhs>::type Rhs; - typedef typename scalar_product_traits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar; + typedef typename ScalarBinaryOpTraits<typename Lhs::Scalar, typename Rhs::Scalar>::ReturnType Scalar; typedef typename cwise_promote_storage_type<typename traits<Lhs>::StorageKind, typename traits<Rhs>::StorageKind, scalar_product_op<typename Lhs::Scalar, typename Rhs::Scalar> >::ret StorageKind; typedef typename promote_index_type<typename Lhs::StorageIndex, typename Rhs::StorageIndex>::type StorageIndex; diff --git a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt b/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt deleted file mode 100644 index d9690854d..000000000 --- a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_LevenbergMarquardt_SRCS "*.h") - -INSTALL(FILES - ${Eigen_LevenbergMarquardt_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/LevenbergMarquardt COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt b/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt deleted file mode 100644 index cdde64d2c..000000000 --- a/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB 
Eigen_MatrixFunctions_SRCS "*.h") - -INSTALL(FILES - ${Eigen_MatrixFunctions_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/MatrixFunctions COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h index 9f08c6162..afd88ec4d 100644 --- a/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h +++ b/unsupported/Eigen/src/MatrixFunctions/MatrixSquareRoot.h @@ -65,21 +65,6 @@ void matrix_sqrt_quasi_triangular_2x1_off_diagonal_block(const MatrixType& T, ty sqrtT.template block<2,1>(i,j) = A.fullPivLu().solve(rhs); } -// similar to compute1x1offDiagonalBlock() -template <typename MatrixType, typename ResultType> -void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT) -{ - typedef typename traits<MatrixType>::Scalar Scalar; - Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i); - Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j); - Matrix<Scalar,2,2> C = T.template block<2,2>(i,j); - if (j-i > 2) - C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2); - Matrix<Scalar,2,2> X; - matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C); - sqrtT.template block<2,2>(i,j) = X; -} - // solves the equation A X + X B = C where all matrices are 2-by-2 template <typename MatrixType> void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const MatrixType& A, const MatrixType& B, const MatrixType& C) @@ -98,13 +83,13 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const coeffMatrix.coeffRef(2,3) = B.coeff(1,0); coeffMatrix.coeffRef(3,1) = A.coeff(1,0); coeffMatrix.coeffRef(3,2) = B.coeff(0,1); - + Matrix<Scalar,4,1> rhs; rhs.coeffRef(0) = C.coeff(0,0); rhs.coeffRef(1) = C.coeff(0,1); rhs.coeffRef(2) = C.coeff(1,0); rhs.coeffRef(3) = C.coeff(1,1); - + Matrix<Scalar,4,1> result; result = coeffMatrix.fullPivLu().solve(rhs); @@ -114,6 +99,20 @@ void matrix_sqrt_quasi_triangular_solve_auxiliary_equation(MatrixType& X, const X.coeffRef(1,1) = result.coeff(3); } +// similar to compute1x1offDiagonalBlock() +template <typename MatrixType, typename ResultType> +void matrix_sqrt_quasi_triangular_2x2_off_diagonal_block(const MatrixType& T, typename MatrixType::Index i, typename MatrixType::Index j, ResultType& sqrtT) +{ + typedef typename traits<MatrixType>::Scalar Scalar; + Matrix<Scalar,2,2> A = sqrtT.template block<2,2>(i,i); + Matrix<Scalar,2,2> B = sqrtT.template block<2,2>(j,j); + Matrix<Scalar,2,2> C = T.template block<2,2>(i,j); + if (j-i > 2) + C -= sqrtT.block(i, i+2, 2, j-i-2) * sqrtT.block(i+2, j, j-i-2, 2); + Matrix<Scalar,2,2> X; + matrix_sqrt_quasi_triangular_solve_auxiliary_equation(X, A, B, C); + sqrtT.template block<2,2>(i,j) = X; +} // pre: T is quasi-upper-triangular and sqrtT is a zero matrix of the same size // post: the diagonal blocks of sqrtT are the square roots of the diagonal blocks of T diff --git a/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt b/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt deleted file mode 100644 index 1b887cc8e..000000000 --- a/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_MoreVectorization_SRCS "*.h") - -INSTALL(FILES - ${Eigen_MoreVectorization_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/MoreVectorization COMPONENT Devel - ) diff --git 
a/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt b/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt deleted file mode 100644 index 9322ddadf..000000000 --- a/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_NonLinearOptimization_SRCS "*.h") - -INSTALL(FILES - ${Eigen_NonLinearOptimization_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/NonLinearOptimization COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt b/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt deleted file mode 100644 index 1199aca2f..000000000 --- a/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_NumericalDiff_SRCS "*.h") - -INSTALL(FILES - ${Eigen_NumericalDiff_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/NumericalDiff COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/Polynomials/CMakeLists.txt b/unsupported/Eigen/src/Polynomials/CMakeLists.txt deleted file mode 100644 index 51f13f3cb..000000000 --- a/unsupported/Eigen/src/Polynomials/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Polynomials_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Polynomials_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Polynomials COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/Skyline/CMakeLists.txt b/unsupported/Eigen/src/Skyline/CMakeLists.txt deleted file mode 100644 index 3bf1b0dd4..000000000 --- a/unsupported/Eigen/src/Skyline/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Skyline_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Skyline_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Skyline COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/SparseExtra/CMakeLists.txt b/unsupported/Eigen/src/SparseExtra/CMakeLists.txt deleted file mode 100644 index 7ea32ca5e..000000000 --- a/unsupported/Eigen/src/SparseExtra/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_SparseExtra_SRCS "*.h") - -INSTALL(FILES - ${Eigen_SparseExtra_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/SparseExtra COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h new file mode 100644 index 000000000..ed415db99 --- /dev/null +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsArrayAPI.h @@ -0,0 +1,124 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + + +#ifndef EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H +#define EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H + +namespace Eigen { + +/** \cpp11 \returns an expression of the coefficient-wise igamma(\a a, \a x) of the given arrays. + * + * This function computes the coefficient-wise incomplete gamma function. + * + * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations of igamma(T,T) for any scalar + * type T to be supported.
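+ *
+ * A minimal usage sketch (array sizes and values here are arbitrary, chosen
+ * only for illustration):
+ * \code
+ * Eigen::ArrayXf a(3), x(3);
+ * a << 0.5f, 1.0f, 2.0f;
+ * x << 0.5f, 1.0f, 2.0f;
+ * Eigen::ArrayXf p = Eigen::igamma(a, x); // p(i) = igamma(a(i), x(i))
+ * \endcode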
+ * + * \sa Eigen::igammac(), Eigen::lgamma() + */ +template<typename Derived,typename ExponentDerived> +inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived> +igamma(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) +{ + return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igamma_op<typename Derived::Scalar>, const Derived, const ExponentDerived>( + a.derived(), + x.derived() + ); +} + +/** \cpp11 \returns an expression of the coefficient-wise igammac(\a a, \a x) of the given arrays. + * + * This function computes the coefficient-wise complementary incomplete gamma function. + * + * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations of igammac(T,T) for any scalar + * type T to be supported. + * + * \sa Eigen::igamma(), Eigen::lgamma() + */ +template<typename Derived,typename ExponentDerived> +inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived> +igammac(const Eigen::ArrayBase<Derived>& a, const Eigen::ArrayBase<ExponentDerived>& x) +{ + return Eigen::CwiseBinaryOp<Eigen::internal::scalar_igammac_op<typename Derived::Scalar>, const Derived, const ExponentDerived>( + a.derived(), + x.derived() + ); +} + +/** \cpp11 \returns an expression of the coefficient-wise polygamma(\a n, \a x) of the given arrays. + * + * It returns the \a n -th derivative of the digamma function (psi) evaluated at \c x. + * + * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations of polygamma(T,T) for any scalar + * type T to be supported. + * + * \sa Eigen::digamma() + */ +// * \warning Be careful with the order of the parameters: x.polygamma(n) is equivalent to polygamma(n,x) +// * \sa ArrayBase::polygamma() +template<typename DerivedN,typename DerivedX> +inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX> +polygamma(const Eigen::ArrayBase<DerivedN>& n, const Eigen::ArrayBase<DerivedX>& x) +{ + return Eigen::CwiseBinaryOp<Eigen::internal::scalar_polygamma_op<typename DerivedX::Scalar>, const DerivedN, const DerivedX>( + n.derived(), + x.derived() + ); +} + +/** \cpp11 \returns an expression of the coefficient-wise betainc(\a x, \a a, \a b) of the given arrays. + * + * This function computes the regularized incomplete beta function (integral). + * + * \note This function supports only float and double scalar types in c++11 mode. To support other scalar types, + * or float/double in non c++11 mode, the user has to provide implementations of betainc(T,T,T) for any scalar + * type T to be supported.
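+ *
+ * A minimal usage sketch (values are arbitrary; as noted above, this assumes
+ * float/double scalars in c++11 mode):
+ * \code
+ * Eigen::ArrayXd a(2), b(2), x(2);
+ * a << 1.0, 2.0;
+ * b << 2.0, 3.0;
+ * x << 0.5, 0.25;
+ * Eigen::ArrayXd r = Eigen::betainc(a, b, x); // r(i) = betainc(a(i), b(i), x(i))
+ * \endcode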
+ * + * \sa Eigen::betainc(), Eigen::lgamma() + */ +template<typename ArgADerived, typename ArgBDerived, typename ArgXDerived> +inline const Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived> +betainc(const Eigen::ArrayBase<ArgADerived>& a, const Eigen::ArrayBase<ArgBDerived>& b, const Eigen::ArrayBase<ArgXDerived>& x) +{ + return Eigen::CwiseTernaryOp<Eigen::internal::scalar_betainc_op<typename ArgXDerived::Scalar>, const ArgADerived, const ArgBDerived, const ArgXDerived>( + a.derived(), + b.derived(), + x.derived() + ); +} + + +/** \returns an expression of the coefficient-wise zeta(\a x, \a q) of the given arrays. + * + * It returns the Riemann zeta function of two arguments \a x and \a q: + * + * \param x is the exponent; it must be > 1 + * \param q is the shift; it must be > 0 + * + * \note This function supports only float and double scalar types. To support other scalar types, the user has + * to provide implementations of zeta(T,T) for any scalar type T to be supported. + * + * \sa ArrayBase::zeta() + */ +template<typename DerivedX,typename DerivedQ> +inline const Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ> +zeta(const Eigen::ArrayBase<DerivedX>& x, const Eigen::ArrayBase<DerivedQ>& q) +{ + return Eigen::CwiseBinaryOp<Eigen::internal::scalar_zeta_op<typename DerivedX::Scalar>, const DerivedX, const DerivedQ>( + x.derived(), + q.derived() + ); +} + +} // end namespace Eigen + +#endif // EIGEN_SPECIALFUNCTIONS_ARRAYAPI_H diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h new file mode 100644 index 000000000..d8f2363be --- /dev/null +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsFunctors.h @@ -0,0 +1,236 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Eugene Brevdo <ebrevdo@gmail.com> +// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
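+
+// For orientation: these functors are the scalar kernels behind the array API
+// in SpecialFunctionsArrayAPI.h. For instance, Eigen::igamma(a, x) builds a
+// CwiseBinaryOp<internal::scalar_igamma_op<Scalar>, const Derived, const ExponentDerived>
+// expression whose coefficients are evaluated through operator(), or through
+// packetOp when PacketAccess is enabled for the scalar type.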
+ +#ifndef EIGEN_SPECIALFUNCTIONS_FUNCTORS_H +#define EIGEN_SPECIALFUNCTIONS_FUNCTORS_H + +namespace Eigen { + +namespace internal { + + +/** \internal + * \brief Template functor to compute the incomplete gamma function igamma(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igamma + */ +template<typename Scalar> struct scalar_igamma_op : binary_op_base<Scalar,Scalar> +{ + EIGEN_EMPTY_STRUCT_CTOR(scalar_igamma_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const { + using numext::igamma; return igamma(a, x); + } + template<typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const { + return internal::pigamma(a, x); + } +}; +template<typename Scalar> +struct functor_traits<scalar_igamma_op<Scalar> > { + enum { + // Guesstimate + Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasIGamma + }; +}; + + +/** \internal + * \brief Template functor to compute the complementary incomplete gamma function igammac(a, x) + * + * \sa class CwiseBinaryOp, Cwise::igammac + */ +template<typename Scalar> struct scalar_igammac_op : binary_op_base<Scalar,Scalar> +{ + EIGEN_EMPTY_STRUCT_CTOR(scalar_igammac_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& a, const Scalar& x) const { + using numext::igammac; return igammac(a, x); + } + template<typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& a, const Packet& x) const + { + return internal::pigammac(a, x); + } +}; +template<typename Scalar> +struct functor_traits<scalar_igammac_op<Scalar> > { + enum { + // Guesstimate + Cost = 20 * NumTraits<Scalar>::MulCost + 10 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasIGammac + }; +}; + + +/** \internal + * \brief Template functor to compute the incomplete beta integral betainc(a, b, x) + * + */ +template<typename Scalar> struct scalar_betainc_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_betainc_op) + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Scalar operator() (const Scalar& x, const Scalar& a, const Scalar& b) const { + using numext::betainc; return betainc(x, a, b); + } + template<typename Packet> + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Packet packetOp(const Packet& x, const Packet& a, const Packet& b) const + { + return internal::pbetainc(x, a, b); + } +}; +template<typename Scalar> +struct functor_traits<scalar_betainc_op<Scalar> > { + enum { + // Guesstimate + Cost = 400 * NumTraits<Scalar>::MulCost + 400 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasBetaInc + }; +}; + + +/** \internal + * \brief Template functor to compute the natural log of the absolute + * value of Gamma of a scalar + * \sa class CwiseUnaryOp, Cwise::lgamma() + */ +template<typename Scalar> struct scalar_lgamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_lgamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::lgamma; return lgamma(a); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::plgamma(a); } +}; +template<typename Scalar> +struct functor_traits<scalar_lgamma_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasLGamma + }; +}; + +/** \internal + * \brief Template functor to compute psi, the 
derivative of lgamma of a scalar. + * \sa class CwiseUnaryOp, Cwise::digamma() + */ +template<typename Scalar> struct scalar_digamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_digamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::digamma; return digamma(a); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::pdigamma(a); } +}; +template<typename Scalar> +struct functor_traits<scalar_digamma_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasDiGamma + }; +}; + +/** \internal + * \brief Template functor to compute the Riemann Zeta function of two arguments. + * \sa class CwiseBinaryOp, Cwise::zeta() + */ +template<typename Scalar> struct scalar_zeta_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_zeta_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& x, const Scalar& q) const { + using numext::zeta; return zeta(x, q); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& x, const Packet& q) const { return internal::pzeta(x, q); } +}; +template<typename Scalar> +struct functor_traits<scalar_zeta_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasZeta + }; +}; + +/** \internal + * \brief Template functor to compute the polygamma function. + * \sa class CwiseBinaryOp, Cwise::polygamma() + */ +template<typename Scalar> struct scalar_polygamma_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_polygamma_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& n, const Scalar& x) const { + using numext::polygamma; return polygamma(n, x); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& n, const Packet& x) const { return internal::ppolygamma(n, x); } +}; +template<typename Scalar> +struct functor_traits<scalar_polygamma_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasPolygamma + }; +}; + +/** \internal + * \brief Template functor to compute the Gauss error function of a + * scalar + * \sa class CwiseUnaryOp, Cwise::erf() + */ +template<typename Scalar> struct scalar_erf_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_erf_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::erf; return erf(a); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return internal::perf(a); } +}; +template<typename Scalar> +struct functor_traits<scalar_erf_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasErf + }; +}; + +/** \internal + * \brief Template functor to compute the Complementary Error Function + * of a scalar + * \sa class CwiseUnaryOp, Cwise::erfc() + */ +template<typename Scalar> struct scalar_erfc_op { + EIGEN_EMPTY_STRUCT_CTOR(scalar_erfc_op) + EIGEN_DEVICE_FUNC inline const Scalar operator() (const Scalar& a) const { + using numext::erfc; return erfc(a); + } + typedef typename packet_traits<Scalar>::type Packet; + EIGEN_DEVICE_FUNC inline Packet packetOp(const Packet& a) const { return
internal::perfc(a); } +}; +template<typename Scalar> +struct functor_traits<scalar_erfc_op<Scalar> > +{ + enum { + // Guesstimate + Cost = 10 * NumTraits<Scalar>::MulCost + 5 * NumTraits<Scalar>::AddCost, + PacketAccess = packet_traits<Scalar>::HasErfc + }; +}; + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_SPECIALFUNCTIONS_FUNCTORS_H diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h new file mode 100644 index 000000000..553bcda6a --- /dev/null +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsHalf.h @@ -0,0 +1,47 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPECIALFUNCTIONS_HALF_H +#define EIGEN_SPECIALFUNCTIONS_HALF_H + +namespace Eigen { +namespace numext { + +#if EIGEN_HAS_C99_MATH +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half lgamma(const Eigen::half& a) { + return Eigen::half(Eigen::numext::lgamma(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half digamma(const Eigen::half& a) { + return Eigen::half(Eigen::numext::digamma(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half zeta(const Eigen::half& x, const Eigen::half& q) { + return Eigen::half(Eigen::numext::zeta(static_cast<float>(x), static_cast<float>(q))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half polygamma(const Eigen::half& n, const Eigen::half& x) { + return Eigen::half(Eigen::numext::polygamma(static_cast<float>(n), static_cast<float>(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erf(const Eigen::half& a) { + return Eigen::half(Eigen::numext::erf(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half erfc(const Eigen::half& a) { + return Eigen::half(Eigen::numext::erfc(static_cast<float>(a))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igamma(const Eigen::half& a, const Eigen::half& x) { + return Eigen::half(Eigen::numext::igamma(static_cast<float>(a), static_cast<float>(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half igammac(const Eigen::half& a, const Eigen::half& x) { + return Eigen::half(Eigen::numext::igammac(static_cast<float>(a), static_cast<float>(x))); +} +template<> EIGEN_STRONG_INLINE EIGEN_DEVICE_FUNC Eigen::half betainc(const Eigen::half& a, const Eigen::half& b, const Eigen::half& x) { + return Eigen::half(Eigen::numext::betainc(static_cast<float>(a), static_cast<float>(b), static_cast<float>(x))); +} +#endif + +} // end namespace numext +} // end namespace Eigen + +#endif // EIGEN_SPECIALFUNCTIONS_HALF_H diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h new file mode 100644 index 000000000..52619fc0c --- /dev/null +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsImpl.h @@ -0,0 +1,1551 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2015 Eugene Brevdo <ebrevdo@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. 
If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#ifndef EIGEN_SPECIAL_FUNCTIONS_H +#define EIGEN_SPECIAL_FUNCTIONS_H + +namespace Eigen { +namespace internal { + +// Parts of this code are based on the Cephes Math Library. +// +// Cephes Math Library Release 2.8: June, 2000 +// Copyright 1984, 1987, 1992, 2000 by Stephen L. Moshier +// +// Permission has been kindly provided by the original author +// to incorporate the Cephes software into the Eigen codebase: +// +// From: Stephen Moshier +// To: Eugene Brevdo +// Subject: Re: Permission to wrap several cephes functions in Eigen +// +// Hello Eugene, +// +// Thank you for writing. +// +// If your licensing is similar to BSD, the formal way that has been +// handled is simply to add a statement to the effect that you are incorporating +// the Cephes software by permission of the author. +// +// Good luck with your project, +// Steve + +namespace cephes { + +/* polevl (modified for Eigen) + * + * Evaluate polynomial + * + * + * + * SYNOPSIS: + * + * int N; + * Scalar x, y, coef[N+1]; + * + * y = polevl<decltype(x), N>( x, coef); + * + * + * + * DESCRIPTION: + * + * Evaluates polynomial of degree N: + * + * 2 N + * y = C + C x + C x +...+ C x + * 0 1 2 N + * + * Coefficients are stored in reverse order: + * + * coef[0] = C , ..., coef[N] = C . + * N 0 + * + * The function p1evl() assumes that coef[N] = 1.0 and is + * omitted from the array. Its calling arguments are + * otherwise the same as polevl(). + * + * + * The Eigen implementation is templatized. For best speed, store + * coef as a const array (constexpr), e.g. + * + * const double coef[] = {1.0, 2.0, 3.0, ...}; + * + */ +template <typename Scalar, int N> +struct polevl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar x, const Scalar coef[]) { + EIGEN_STATIC_ASSERT((N > 0), YOU_MADE_A_PROGRAMMING_MISTAKE); + + return polevl<Scalar, N - 1>::run(x, coef) * x + coef[N]; + } +}; + +template <typename Scalar> +struct polevl<Scalar, 0> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar, const Scalar coef[]) { + return coef[0]; + } +}; + +} // end namespace cephes + +/**************************************************************************** + * Implementation of lgamma, requires C++11/C99 * + ****************************************************************************/ + +template <typename Scalar> +struct lgamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <typename Scalar> +struct lgamma_retval { + typedef Scalar type; +}; + +#if EIGEN_HAS_C99_MATH +template <> +struct lgamma_impl<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(float x) { return ::lgammaf(x); } +}; + +template <> +struct lgamma_impl<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(double x) { return ::lgamma(x); } +}; +#endif + +/**************************************************************************** + * Implementation of digamma (psi), based on Cephes * + ****************************************************************************/ + +template <typename Scalar> +struct digamma_retval { + typedef Scalar type; +}; + +/* + * + * Polynomial evaluation helper for the Psi (digamma) function. 
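+ *
+ * (As a concrete illustration of the reversed coefficient order used by
+ * cephes::polevl above, not taken from Cephes itself: with
+ * const double coef[] = {2.0, 3.0, 4.0},
+ * cephes::polevl<double, 2>::run(5.0, coef) computes 2*5*5 + 3*5 + 4 = 69.)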
+ * + * digamma_impl_maybe_poly::run(s) evaluates the asymptotic Psi expansion for + * input Scalar s, assuming s is above 10.0. + * + * If s is above a certain threshold for the given Scalar type, zero + * is returned. Otherwise the polynomial is evaluated with enough + * coefficients for results matching Scalar machine precision. + * + * + */ +template <typename Scalar> +struct digamma_impl_maybe_poly { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + + +template <> +struct digamma_impl_maybe_poly<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(const float s) { + const float A[] = { + -4.16666666666666666667E-3f, + 3.96825396825396825397E-3f, + -8.33333333333333333333E-3f, + 8.33333333333333333333E-2f + }; + + float z; + if (s < 1.0e8f) { + z = 1.0f / (s * s); + return z * cephes::polevl<float, 3>::run(z, A); + } else return 0.0f; + } +}; + +template <> +struct digamma_impl_maybe_poly<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double s) { + const double A[] = { + 8.33333333333333333333E-2, + -2.10927960927960927961E-2, + 7.57575757575757575758E-3, + -4.16666666666666666667E-3, + 3.96825396825396825397E-3, + -8.33333333333333333333E-3, + 8.33333333333333333333E-2 + }; + + double z; + if (s < 1.0e17) { + z = 1.0 / (s * s); + return z * cephes::polevl<double, 6>::run(z, A); + } + else return 0.0; + } +}; + +template <typename Scalar> +struct digamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar x) { + /* + * + * Psi (digamma) function (modified for Eigen) + * + * + * SYNOPSIS: + * + * double x, y, psi(); + * + * y = psi( x ); + * + * + * DESCRIPTION: + * + * d - + * psi(x) = -- ln | (x) + * dx + * + * is the logarithmic derivative of the gamma function. + * For integer x, + * n-1 + * - + * psi(n) = -EUL + > 1/k. + * - + * k=1 + * + * If x is negative, it is transformed to a positive argument by the + * reflection formula psi(1-x) = psi(x) + pi cot(pi x). + * For general positive x, the argument is made greater than 10 + * using the recurrence psi(x+1) = psi(x) + 1/x. + * Then the following asymptotic expansion is applied: + * + * inf. B + * - 2k + * psi(x) = log(x) - 1/2x - > ------- + * - 2k + * k=1 2k x + * + * where the B2k are Bernoulli numbers. 
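+ *
+ * A spot check of the identities above (standard values, not part of the
+ * original Cephes notes): psi(1) = -EUL = -0.57721566..., and the recurrence
+ * then gives psi(2) = psi(1) + 1 = 0.42278433...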
+ * + * ACCURACY (float): + * Relative error (except absolute when |psi| < 1): + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 1.3e-15 1.4e-16 + * IEEE -30,0 40000 1.5e-15 2.2e-16 + * + * ACCURACY (double): + * Absolute error, relative when |psi| > 1 : + * arithmetic domain # trials peak rms + * IEEE -33,0 30000 8.2e-7 1.2e-7 + * IEEE 0,33 100000 7.3e-7 7.7e-8 + * + * ERROR MESSAGES: + * message condition value returned + * psi singularity x integer <=0 INFINITY + */ + + Scalar p, q, nz, s, w, y; + bool negative = false; + + const Scalar maxnum = NumTraits<Scalar>::infinity(); + const Scalar m_pi = Scalar(EIGEN_PI); + + const Scalar zero = Scalar(0); + const Scalar one = Scalar(1); + const Scalar half = Scalar(0.5); + nz = zero; + + if (x <= zero) { + negative = true; + q = x; + p = numext::floor(q); + if (p == q) { + return maxnum; + } + /* Remove the zeros of tan(m_pi x) + * by subtracting the nearest integer from x + */ + nz = q - p; + if (nz != half) { + if (nz > half) { + p += one; + nz = q - p; + } + nz = m_pi / numext::tan(m_pi * nz); + } + else { + nz = zero; + } + x = one - x; + } + + /* use the recurrence psi(x+1) = psi(x) + 1/x. */ + s = x; + w = zero; + while (s < Scalar(10)) { + w += one / s; + s += one; + } + + y = digamma_impl_maybe_poly<Scalar>::run(s); + + y = numext::log(s) - (half / s) - y - w; + + return (negative) ? y - nz : y; + } +}; + +/**************************************************************************** + * Implementation of erf, requires C++11/C99 * + ****************************************************************************/ + +template <typename Scalar> +struct erf_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <typename Scalar> +struct erf_retval { + typedef Scalar type; +}; + +#if EIGEN_HAS_C99_MATH +template <> +struct erf_impl<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(float x) { return ::erff(x); } +}; + +template <> +struct erf_impl<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(double x) { return ::erf(x); } +}; +#endif // EIGEN_HAS_C99_MATH + +/*************************************************************************** +* Implementation of erfc, requires C++11/C99 * +****************************************************************************/ + +template <typename Scalar> +struct erfc_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <typename Scalar> +struct erfc_retval { + typedef Scalar type; +}; + +#if EIGEN_HAS_C99_MATH +template <> +struct erfc_impl<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float run(const float x) { return ::erfcf(x); } +}; + +template <> +struct erfc_impl<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double run(const double x) { return ::erfc(x); } +}; +#endif // EIGEN_HAS_C99_MATH + +/************************************************************************************************************** + * Implementation of igammac (complemented incomplete gamma integral), based on Cephes but requires C++11/C99 * + **************************************************************************************************************/ + +template <typename Scalar> +struct igammac_retval { + 
typedef Scalar type; +}; + +// NOTE: cephes_helper is also used to implement zeta +template <typename Scalar> +struct cephes_helper { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar machep() { assert(false && "machep not supported for this type"); return 0.0; } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar big() { assert(false && "big not supported for this type"); return 0.0; } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar biginv() { assert(false && "biginv not supported for this type"); return 0.0; } +}; + +template <> +struct cephes_helper<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float machep() { + return NumTraits<float>::epsilon() / 2; // 1.0 - machep == 1.0 + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float big() { + // use epsneg (1.0 - epsneg == 1.0) + return 1.0f / (NumTraits<float>::epsilon() / 2); + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float biginv() { + // epsneg + return machep(); + } +}; + +template <> +struct cephes_helper<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double machep() { + return NumTraits<double>::epsilon() / 2; // 1.0 - machep == 1.0 + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double big() { + return 1.0 / NumTraits<double>::epsilon(); + } + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double biginv() { + // inverse of eps + return NumTraits<double>::epsilon(); + } +}; + +#if !EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct igammac_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> struct igamma_impl; // predeclare igamma_impl + +template <typename Scalar> +struct igammac_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + /* igamc() + * + * Incomplete gamma integral (modified for Eigen) + * + * + * + * SYNOPSIS: + * + * double a, x, y, igamc(); + * + * y = igamc( a, x ); + * + * DESCRIPTION: + * + * The function is defined by + * + * + * igamc(a,x) = 1 - igam(a,x) + * + * inf. + * - + * 1 | | -t a-1 + * = ----- | e t dt. + * - | | + * | (a) - + * x + * + * + * In this implementation both arguments must be positive. + * The integral is evaluated by either a power series or + * continued fraction expansion, depending on the relative + * values of a and x. + * + * ACCURACY (float): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 30000 7.8e-6 5.9e-7 + * + * + * ACCURACY (double): + * + * Tested at random a, x. + * a x Relative error: + * arithmetic domain domain # trials peak rms + * IEEE 0.5,100 0,100 200000 1.9e-14 1.7e-15 + * IEEE 0.01,0.5 0,100 200000 1.4e-13 1.6e-15 + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + if ((x < zero) || (a <= zero)) { + // domain error + return nan; + } + + if ((x < one) || (x < a)) { + /* The checks above ensure that we meet the preconditions for + * igamma_impl::Impl(), so call it, rather than igamma_impl::Run(). + * Calling Run() would also work, but in that case the compiler may not be + * able to prove that igammac_impl::Run and igamma_impl::Run are not + * mutually recursive. 
This leads to worse code, particularly on + * platforms like nvptx, where recursion is allowed only begrudgingly. + */ + return (one - igamma_impl<Scalar>::Impl(a, x)); + } + + return Impl(a, x); + } + + private: + /* igamma_impl calls igammac_impl::Impl. */ + friend struct igamma_impl<Scalar>; + + /* Actually computes igamc(a, x). + * + * Preconditions: + * a > 0 + * x >= 1 + * x >= a + */ + EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) { + const Scalar zero = 0; + const Scalar one = 1; + const Scalar two = 2; + const Scalar machep = cephes_helper<Scalar>::machep(); + const Scalar maxlog = numext::log(NumTraits<Scalar>::highest()); + const Scalar big = cephes_helper<Scalar>::big(); + const Scalar biginv = cephes_helper<Scalar>::biginv(); + const Scalar inf = NumTraits<Scalar>::infinity(); + + Scalar ans, ax, c, yc, r, t, y, z; + Scalar pk, pkm1, pkm2, qk, qkm1, qkm2; + + if (x == inf) return zero; // std::isinf crashes on CUDA + + /* Compute x**a * exp(-x) / gamma(a) */ + ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a); + if (ax < -maxlog) { // underflow + return zero; + } + ax = numext::exp(ax); + + // continued fraction + y = one - a; + z = x + y + one; + c = zero; + pkm2 = one; + qkm2 = x; + pkm1 = x + one; + qkm1 = z * x; + ans = pkm1 / qkm1; + + while (true) { + c += one; + y += one; + z += two; + yc = y * c; + pk = pkm1 * z - pkm2 * yc; + qk = qkm1 * z - qkm2 * yc; + if (qk != zero) { + r = pk / qk; + t = numext::abs((ans - r) / r); + ans = r; + } else { + t = one; + } + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + if (numext::abs(pk) > big) { + pkm2 *= biginv; + pkm1 *= biginv; + qkm2 *= biginv; + qkm1 *= biginv; + } + if (t <= machep) { + break; + } + } + + return (ans * ax); + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/************************************************************************************************ + * Implementation of igamma (incomplete gamma integral), based on Cephes but requires C++11/C99 * + ************************************************************************************************/ + +template <typename Scalar> +struct igamma_retval { + typedef Scalar type; +}; + +#if !EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct igamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> +struct igamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar a, Scalar x) { + /* igam() + * Incomplete gamma integral + * + * + * + * SYNOPSIS: + * + * double a, x, y, igam(); + * + * y = igam( a, x ); + * + * DESCRIPTION: + * + * The function is defined by + * + * x + * - + * 1 | | -t a-1 + * igam(a,x) = ----- | e t dt. + * - | | + * | (a) - + * 0 + * + * + * In this implementation both arguments must be positive. + * The integral is evaluated by either a power series or + * continued fraction expansion, depending on the relative + * values of a and x. + * + * ACCURACY (double): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 200000 3.6e-14 2.9e-15 + * IEEE 0,100 300000 9.9e-14 1.5e-14 + * + * + * ACCURACY (float): + * + * Relative error: + * arithmetic domain # trials peak rms + * IEEE 0,30 20000 7.8e-6 5.9e-7 + * + */ + /* + Cephes Math Library Release 2.2: June, 1992 + Copyright 1985, 1987, 1992 by Stephen L. 
Moshier + Direct inquiries to 30 Frost Street, Cambridge, MA 02140 + */ + + + /* left tail of incomplete gamma function: + * + * inf. k + * a -x - x + * x e > ---------- + * - - + * k=0 | (a+k+1) + * + */ + const Scalar zero = 0; + const Scalar one = 1; + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + if (x == zero) return zero; + + if ((x < zero) || (a <= zero)) { // domain error + return nan; + } + + if ((x > one) && (x > a)) { + /* The checks above ensure that we meet the preconditions for + * igammac_impl::Impl(), so call it, rather than igammac_impl::Run(). + * Calling Run() would also work, but in that case the compiler may not be + * able to prove that igammac_impl::Run and igamma_impl::Run are not + * mutually recursive. This leads to worse code, particularly on + * platforms like nvptx, where recursion is allowed only begrudgingly. + */ + return (one - igammac_impl<Scalar>::Impl(a, x)); + } + + return Impl(a, x); + } + + private: + /* igammac_impl calls igamma_impl::Impl. */ + friend struct igammac_impl<Scalar>; + + /* Actually computes igam(a, x). + * + * Preconditions: + * x > 0 + * a > 0 + * !(x > 1 && x > a) + */ + EIGEN_DEVICE_FUNC static Scalar Impl(Scalar a, Scalar x) { + const Scalar zero = 0; + const Scalar one = 1; + const Scalar machep = cephes_helper<Scalar>::machep(); + const Scalar maxlog = numext::log(NumTraits<Scalar>::highest()); + + Scalar ans, ax, c, r; + + /* Compute x**a * exp(-x) / gamma(a) */ + ax = a * numext::log(x) - x - lgamma_impl<Scalar>::run(a); + if (ax < -maxlog) { + // underflow + return zero; + } + ax = numext::exp(ax); + + /* power series */ + r = a; + c = one; + ans = one; + + while (true) { + r += one; + c *= x/r; + ans += c; + if (c/ans <= machep) { + break; + } + } + + return (ans * ax / a); + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/***************************************************************************** + * Implementation of Riemann zeta function of two arguments, based on Cephes * + *****************************************************************************/ + +template <typename Scalar> +struct zeta_retval { + typedef Scalar type; +}; + +template <typename Scalar> +struct zeta_impl_series { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(const Scalar) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +template <> +struct zeta_impl_series<float> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE bool run(float& a, float& b, float& s, const float x, const float machep) { + int i = 0; + while(i < 9) + { + i += 1; + a += 1.0f; + b = numext::pow( a, -x ); + s += b; + if( numext::abs(b/s) < machep ) + return true; + } + + //Return whether we are done + return false; + } +}; + +template <> +struct zeta_impl_series<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE bool run(double& a, double& b, double& s, const double x, const double machep) { + int i = 0; + while( (i < 9) || (a <= 9.0) ) + { + i += 1; + a += 1.0; + b = numext::pow( a, -x ); + s += b; + if( numext::abs(b/s) < machep ) + return true; + } + + //Return whether we are done + return false; + } +}; + +template <typename Scalar> +struct zeta_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar x, Scalar q) { + /* zeta.c + * + * Riemann zeta function of two arguments + * + * + * + * SYNOPSIS: + * + * double x, q, y, zeta(); + * + * y = zeta( x, q ); + * + * + * + * DESCRIPTION: + * + * + * + * inf. 
+ * - -x + * zeta(x,q) = > (k+q) + * - + * k=0 + * + * where x > 1 and q is not a negative integer or zero. + * The Euler-Maclaurin summation formula is used to obtain + * the expansion + * + * n + * - -x + * zeta(x,q) = > (k+q) + * - + * k=1 + * + * 1-x inf. B x(x+1)...(x+2j) + * (n+q) 1 - 2j + * + --------- - ------- + > -------------------- + * x-1 x - x+2j+1 + * 2(n+q) j=1 (2j)! (n+q) + * + * where the B2j are Bernoulli numbers. Note that (see zetac.c) + * zeta(x,1) = zetac(x) + 1. + * + * + * + * ACCURACY: + * + * Relative error for single precision: + * arithmetic domain # trials peak rms + * IEEE 0,25 10000 6.9e-7 1.0e-7 + * + * Large arguments may produce underflow in powf(), in which + * case the results are inaccurate. + * + * REFERENCE: + * + * Gradshteyn, I. S., and I. M. Ryzhik, Tables of Integrals, + * Series, and Products, p. 1073; Academic Press, 1980. + * + */ + + int i; + Scalar p, r, a, b, k, s, t, w; + + const Scalar A[] = { + Scalar(12.0), + Scalar(-720.0), + Scalar(30240.0), + Scalar(-1209600.0), + Scalar(47900160.0), + Scalar(-1.8924375803183791606e9), /*1.307674368e12/691*/ + Scalar(7.47242496e10), + Scalar(-2.950130727918164224e12), /*1.067062284288e16/3617*/ + Scalar(1.1646782814350067249e14), /*5.109094217170944e18/43867*/ + Scalar(-4.5979787224074726105e15), /*8.028576626982912e20/174611*/ + Scalar(1.8152105401943546773e17), /*1.5511210043330985984e23/854513*/ + Scalar(-7.1661652561756670113e18) /*1.6938241367317436694528e27/236364091*/ + }; + + const Scalar maxnum = NumTraits<Scalar>::infinity(); + const Scalar zero = 0.0, half = 0.5, one = 1.0; + const Scalar machep = cephes_helper<Scalar>::machep(); + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + if( x == one ) + return maxnum; + + if( x < one ) + { + return nan; + } + + if( q <= zero ) + { + if(q == numext::floor(q)) + { + return maxnum; + } + p = x; + r = numext::floor(p); + if (p != r) + return nan; + } + + /* Permit negative q but continue sum until n+q > +9 . + * This case should be handled by a reflection formula. + * If q<0 and x is an integer, there is a relation to + * the polygamma function. 
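+ *
+ * A convenient spot check (standard identity, not from the Cephes notes):
+ * zeta(x, 1) reduces to the ordinary Riemann zeta function, e.g.
+ * zeta(2, 1) = pi*pi/6 = 1.6449340668...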
+ */ + s = numext::pow( q, -x ); + a = q; + b = zero; + // Run the summation in a helper function that is specific to the floating precision + if (zeta_impl_series<Scalar>::run(a, b, s, x, machep)) { + return s; + } + + w = a; + s += b*w/(x-one); + s -= half * b; + a = one; + k = zero; + for( i=0; i<12; i++ ) + { + a *= x + k; + b /= w; + t = a*b/A[i]; + s = s + t; + t = numext::abs(t/s); + if( t < machep ) { + break; + } + k += one; + a *= x + k; + b /= w; + k += one; + } + return s; + } +}; + +/**************************************************************************** + * Implementation of polygamma function, requires C++11/C99 * + ****************************************************************************/ + +template <typename Scalar> +struct polygamma_retval { + typedef Scalar type; +}; + +#if !EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct polygamma_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar n, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> +struct polygamma_impl { + EIGEN_DEVICE_FUNC + static Scalar run(Scalar n, Scalar x) { + Scalar zero = 0.0, one = 1.0; + Scalar nplus = n + one; + const Scalar nan = NumTraits<Scalar>::quiet_NaN(); + + // Check that n is an integer + if (numext::floor(n) != n) { + return nan; + } + // Just return the digamma function for n = 0 + else if (n == zero) { + return digamma_impl<Scalar>::run(x); + } + // Use the same implementation as scipy + else { + Scalar factorial = numext::exp(lgamma_impl<Scalar>::run(nplus)); + return numext::pow(-one, nplus) * factorial * zeta_impl<Scalar>::run(nplus, x); + } + } +}; + +#endif // EIGEN_HAS_C99_MATH + +/************************************************************************************************ + * Implementation of betainc (incomplete beta integral), based on Cephes but requires C++11/C99 * + ************************************************************************************************/ + +template <typename Scalar> +struct betainc_retval { + typedef Scalar type; +}; + +#if !EIGEN_HAS_C99_MATH + +template <typename Scalar> +struct betainc_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +#else + +template <typename Scalar> +struct betainc_impl { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar, Scalar, Scalar) { + /* betaincf.c + * + * Incomplete beta integral + * + * + * SYNOPSIS: + * + * float a, b, x, y, betaincf(); + * + * y = betaincf( a, b, x ); + * + * + * DESCRIPTION: + * + * Returns incomplete beta integral of the arguments, evaluated + * from zero to x. The function is defined as + * + * x + * - - + * | (a+b) | | a-1 b-1 + * ----------- | t (1-t) dt. + * - - | | + * | (a) | (b) - + * 0 + * + * The domain of definition is 0 <= x <= 1. In this + * implementation a and b are restricted to positive values. + * The integral from x to 1 may be obtained by the symmetry + * relation + * + * 1 - betainc( a, b, x ) = betainc( b, a, 1-x ). + * + * The integral is evaluated by a continued fraction expansion. + * If a < 1, the function calls itself recursively after a + * transformation to increase a to a+1.
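+ *
+ * An elementary special case that serves as a sanity check (not from the
+ * Cephes notes): for a = 1 the integral is 1 - (1-x)^b, so for instance
+ * betaincf( 1, 2, 0.5 ) = 0.75.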
+ * + * ACCURACY (float): + * + * Tested at random points (a,b,x) with a and b in the indicated + * interval and x between 0 and 1. + * + * arithmetic domain # trials peak rms + * Relative error: + * IEEE 0,30 10000 3.7e-5 5.1e-6 + * IEEE 0,100 10000 1.7e-4 2.5e-5 + * The useful domain for relative error is limited by underflow + * of the single precision exponential function. + * Absolute error: + * IEEE 0,30 100000 2.2e-5 9.6e-7 + * IEEE 0,100 10000 6.5e-5 3.7e-6 + * + * Larger errors may occur for extreme ratios of a and b. + * + * ACCURACY (double): + * arithmetic domain # trials peak rms + * IEEE 0,5 10000 6.9e-15 4.5e-16 + * IEEE 0,85 250000 2.2e-13 1.7e-14 + * IEEE 0,1000 30000 5.3e-12 6.3e-13 + * IEEE 0,10000 250000 9.3e-11 7.1e-12 + * IEEE 0,100000 10000 8.7e-10 4.8e-11 + * Outputs smaller than the IEEE gradual underflow threshold + * were excluded from these statistics. + * + * ERROR MESSAGES: + * message condition value returned + * incbet domain x<0, x>1 nan + * incbet underflow nan + */ + + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, Scalar>::value == false), + THIS_TYPE_IS_NOT_SUPPORTED); + return Scalar(0); + } +}; + +/* Continued fraction expansion #1 for incomplete beta integral (small_branch = True) + * Continued fraction expansion #2 for incomplete beta integral (small_branch = False) + */ +template <typename Scalar> +struct incbeta_cfe { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE Scalar run(Scalar a, Scalar b, Scalar x, bool small_branch) { + EIGEN_STATIC_ASSERT((internal::is_same<Scalar, float>::value || + internal::is_same<Scalar, double>::value), + THIS_TYPE_IS_NOT_SUPPORTED); + const Scalar big = cephes_helper<Scalar>::big(); + const Scalar machep = cephes_helper<Scalar>::machep(); + const Scalar biginv = cephes_helper<Scalar>::biginv(); + + const Scalar zero = 0; + const Scalar one = 1; + const Scalar two = 2; + + Scalar xk, pk, pkm1, pkm2, qk, qkm1, qkm2; + Scalar k1, k2, k3, k4, k5, k6, k7, k8, k26update; + Scalar ans; + int n; + + const int num_iters = (internal::is_same<Scalar, float>::value) ? 100 : 300; + const Scalar thresh = + (internal::is_same<Scalar, float>::value) ? machep : Scalar(3) * machep; + Scalar r = (internal::is_same<Scalar, float>::value) ? 
zero : one; + + if (small_branch) { + k1 = a; + k2 = a + b; + k3 = a; + k4 = a + one; + k5 = one; + k6 = b - one; + k7 = k4; + k8 = a + two; + k26update = one; + } else { + k1 = a; + k2 = b - one; + k3 = a; + k4 = a + one; + k5 = one; + k6 = a + b; + k7 = a + one; + k8 = a + two; + k26update = -one; + x = x / (one - x); + } + + pkm2 = zero; + qkm2 = one; + pkm1 = one; + qkm1 = one; + ans = one; + n = 0; + + do { + xk = -(x * k1 * k2) / (k3 * k4); + pk = pkm1 + pkm2 * xk; + qk = qkm1 + qkm2 * xk; + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + + xk = (x * k5 * k6) / (k7 * k8); + pk = pkm1 + pkm2 * xk; + qk = qkm1 + qkm2 * xk; + pkm2 = pkm1; + pkm1 = pk; + qkm2 = qkm1; + qkm1 = qk; + + if (qk != zero) { + r = pk / qk; + if (numext::abs(ans - r) < numext::abs(r) * thresh) { + return r; + } + ans = r; + } + + k1 += one; + k2 += k26update; + k3 += two; + k4 += two; + k5 += one; + k6 -= k26update; + k7 += two; + k8 += two; + + if ((numext::abs(qk) + numext::abs(pk)) > big) { + pkm2 *= biginv; + pkm1 *= biginv; + qkm2 *= biginv; + qkm1 *= biginv; + } + if ((numext::abs(qk) < biginv) || (numext::abs(pk) < biginv)) { + pkm2 *= big; + pkm1 *= big; + qkm2 *= big; + qkm1 *= big; + } + } while (++n < num_iters); + + return ans; + } +}; + +/* Helper functions depending on the Scalar type */ +template <typename Scalar> +struct betainc_helper {}; + +template <> +struct betainc_helper<float> { + /* Core implementation, assumes a large (> 1.0) */ + EIGEN_DEVICE_FUNC static EIGEN_STRONG_INLINE float incbsa(float aa, float bb, + float xx) { + float ans, a, b, t, x, onemx; + bool reversed_a_b = false; + + onemx = 1.0f - xx; + + /* see if x is greater than the mean */ + if (xx > (aa / (aa + bb))) { + reversed_a_b = true; + a = bb; + b = aa; + t = xx; + x = onemx; + } else { + a = aa; + b = bb; + t = onemx; + x = xx; + } + + /* Choose expansion for optimal convergence */ + if (b > 10.0f) { + if (numext::abs(b * x / a) < 0.3f) { + t = betainc_helper<float>::incbps(a, b, x); + if (reversed_a_b) t = 1.0f - t; + return t; + } + } + + ans = x * (a + b - 2.0f) / (a - 1.0f); + if (ans < 1.0f) { + ans = incbeta_cfe<float>::run(a, b, x, true /* small_branch */); + t = b * numext::log(t); + } else { + ans = incbeta_cfe<float>::run(a, b, x, false /* small_branch */); + t = (b - 1.0f) * numext::log(t); + } + + t += a * numext::log(x) + lgamma_impl<float>::run(a + b) - + lgamma_impl<float>::run(a) - lgamma_impl<float>::run(b); + t += numext::log(ans / a); + t = numext::exp(t); + + if (reversed_a_b) t = 1.0f - t; + return t; + } + + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE float incbps(float a, float b, float x) { + float t, u, y, s; + const float machep = cephes_helper<float>::machep(); + + y = a * numext::log(x) + (b - 1.0f) * numext::log1p(-x) - numext::log(a); + y -= lgamma_impl<float>::run(a) + lgamma_impl<float>::run(b); + y += lgamma_impl<float>::run(a + b); + + t = x / (1.0f - x); + s = 0.0f; + u = 1.0f; + do { + b -= 1.0f; + if (b == 0.0f) { + break; + } + a += 1.0f; + u *= t * b / a; + s += u; + } while (numext::abs(u) > machep); + + return numext::exp(y) * (1.0f + s); + } +}; + +template <> +struct betainc_impl<float> { + EIGEN_DEVICE_FUNC + static float run(float a, float b, float x) { + const float nan = NumTraits<float>::quiet_NaN(); + float ans, t; + + if (a <= 0.0f) return nan; + if (b <= 0.0f) return nan; + if ((x <= 0.0f) || (x >= 1.0f)) { + if (x == 0.0f) return 0.0f; + if (x == 1.0f) return 1.0f; + // mtherr("betaincf", DOMAIN); + return nan; + } + + /* transformation for small aa */ + if 
(a <= 1.0f) { + ans = betainc_helper<float>::incbsa(a + 1.0f, b, x); + t = a * numext::log(x) + b * numext::log1p(-x) + + lgamma_impl<float>::run(a + b) - lgamma_impl<float>::run(a + 1.0f) - + lgamma_impl<float>::run(b); + return (ans + numext::exp(t)); + } else { + return betainc_helper<float>::incbsa(a, b, x); + } + } +}; + +template <> +struct betainc_helper<double> { + EIGEN_DEVICE_FUNC + static EIGEN_STRONG_INLINE double incbps(double a, double b, double x) { + const double machep = cephes_helper<double>::machep(); + + double s, t, u, v, n, t1, z, ai; + + ai = 1.0 / a; + u = (1.0 - b) * x; + v = u / (a + 1.0); + t1 = v; + t = u; + n = 2.0; + s = 0.0; + z = machep * ai; + while (numext::abs(v) > z) { + u = (n - b) * x / n; + t *= u; + v = t / (a + n); + s += v; + n += 1.0; + } + s += t1; + s += ai; + + u = a * numext::log(x); + // TODO: gamma() is not directly implemented in Eigen. + /* + if ((a + b) < maxgam && numext::abs(u) < maxlog) { + t = gamma(a + b) / (gamma(a) * gamma(b)); + s = s * t * pow(x, a); + } else { + */ + t = lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) - + lgamma_impl<double>::run(b) + u + numext::log(s); + return s = numext::exp(t); + } +}; + +template <> +struct betainc_impl<double> { + EIGEN_DEVICE_FUNC + static double run(double aa, double bb, double xx) { + const double nan = NumTraits<double>::quiet_NaN(); + const double machep = cephes_helper<double>::machep(); + // const double maxgam = 171.624376956302725; + + double a, b, t, x, xc, w, y; + bool reversed_a_b = false; + + if (aa <= 0.0 || bb <= 0.0) { + return nan; // goto domerr; + } + + if ((xx <= 0.0) || (xx >= 1.0)) { + if (xx == 0.0) return (0.0); + if (xx == 1.0) return (1.0); + // mtherr("incbet", DOMAIN); + return nan; + } + + if ((bb * xx) <= 1.0 && xx <= 0.95) { + return betainc_helper<double>::incbps(aa, bb, xx); + } + + w = 1.0 - xx; + + /* Reverse a and b if x is greater than the mean. */ + if (xx > (aa / (aa + bb))) { + reversed_a_b = true; + a = bb; + b = aa; + xc = xx; + x = w; + } else { + a = aa; + b = bb; + xc = w; + x = xx; + } + + if (reversed_a_b && (b * x) <= 1.0 && x <= 0.95) { + t = betainc_helper<double>::incbps(a, b, x); + if (t <= machep) { + t = 1.0 - machep; + } else { + t = 1.0 - t; + } + return t; + } + + /* Choose expansion for better convergence. */ + y = x * (a + b - 2.0) - (a - 1.0); + if (y < 0.0) { + w = incbeta_cfe<double>::run(a, b, x, true /* small_branch */); + } else { + w = incbeta_cfe<double>::run(a, b, x, false /* small_branch */) / xc; + } + + /* Multiply w by the factor + a b _ _ _ + x (1-x) | (a+b) / ( a | (a) | (b) ) . */ + + y = a * numext::log(x); + t = b * numext::log(xc); + // TODO: gamma is not directly implemented in Eigen. + /* + if ((a + b) < maxgam && numext::abs(y) < maxlog && numext::abs(t) < maxlog) + { + t = pow(xc, b); + t *= pow(x, a); + t /= a; + t *= w; + t *= gamma(a + b) / (gamma(a) * gamma(b)); + } else { + */ + /* Resort to logarithms. 
*/ + y += t + lgamma_impl<double>::run(a + b) - lgamma_impl<double>::run(a) - + lgamma_impl<double>::run(b); + y += numext::log(w / a); + t = numext::exp(y); + + /* } */ + // done: + + if (reversed_a_b) { + if (t <= machep) { + t = 1.0 - machep; + } else { + t = 1.0 - t; + } + } + return t; + } +}; + +#endif // EIGEN_HAS_C99_MATH + +} // end namespace internal + +namespace numext { + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(lgamma, Scalar) + lgamma(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(lgamma, Scalar)::run(x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(digamma, Scalar) + digamma(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(digamma, Scalar)::run(x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(zeta, Scalar) +zeta(const Scalar& x, const Scalar& q) { + return EIGEN_MATHFUNC_IMPL(zeta, Scalar)::run(x, q); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(polygamma, Scalar) +polygamma(const Scalar& n, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(polygamma, Scalar)::run(n, x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erf, Scalar) + erf(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(erf, Scalar)::run(x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(erfc, Scalar) + erfc(const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(erfc, Scalar)::run(x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igamma, Scalar) + igamma(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(igamma, Scalar)::run(a, x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(igammac, Scalar) + igammac(const Scalar& a, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(igammac, Scalar)::run(a, x); +} + +template <typename Scalar> +EIGEN_DEVICE_FUNC inline EIGEN_MATHFUNC_RETVAL(betainc, Scalar) + betainc(const Scalar& a, const Scalar& b, const Scalar& x) { + return EIGEN_MATHFUNC_IMPL(betainc, Scalar)::run(a, b, x); +} + +} // end namespace numext + + +} // end namespace Eigen + +#endif // EIGEN_SPECIAL_FUNCTIONS_H diff --git a/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h new file mode 100644 index 000000000..46d60d323 --- /dev/null +++ b/unsupported/Eigen/src/SpecialFunctions/SpecialFunctionsPacketMath.h @@ -0,0 +1,58 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
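(The numext wrappers just defined are the scalar entry points that everything else, including the packet-math header starting here, routes through. A small sketch, again assuming the installed unsupported headers, of the symmetry relation quoted in the Cephes notes earlier:

    // Sketch: 1 - betainc(a, b, x) == betainc(b, a, 1 - x), up to roundoff.
    #include <cassert>
    #include <cmath>
    #include <unsupported/Eigen/SpecialFunctions>

    int main() {
      const double a = 2.5, b = 4.0, x = 0.3;
      double lhs = 1.0 - Eigen::numext::betainc(a, b, x);
      double rhs = Eigen::numext::betainc(b, a, 1.0 - x);
      assert(std::abs(lhs - rhs) < 1e-12);
      return 0;
    }
)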
+
+#ifndef EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+#define EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+
+namespace Eigen {
+
+namespace internal {
+
+/** \internal \returns the ln(|gamma(\a a)|) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet plgamma(const Packet& a) { using numext::lgamma; return lgamma(a); }
+
+/** \internal \returns the derivative of lgamma, psi(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pdigamma(const Packet& a) { using numext::digamma; return digamma(a); }
+
+/** \internal \returns the zeta function of two arguments (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet pzeta(const Packet& x, const Packet& q) { using numext::zeta; return zeta(x, q); }
+
+/** \internal \returns the polygamma function (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet ppolygamma(const Packet& n, const Packet& x) { using numext::polygamma; return polygamma(n, x); }
+
+/** \internal \returns the erf(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perf(const Packet& a) { using numext::erf; return erf(a); }
+
+/** \internal \returns the erfc(\a a) (coeff-wise) */
+template<typename Packet> EIGEN_DECLARE_FUNCTION_ALLOWING_MULTIPLE_DEFINITIONS
+Packet perfc(const Packet& a) { using numext::erfc; return erfc(a); }
+
+/** \internal \returns the incomplete gamma function igamma(\a a, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pigamma(const Packet& a, const Packet& x) { using numext::igamma; return igamma(a, x); }
+
+/** \internal \returns the complementary incomplete gamma function igammac(\a a, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pigammac(const Packet& a, const Packet& x) { using numext::igammac; return igammac(a, x); }
+
+/** \internal \returns the incomplete beta function betainc(\a a, \a b, \a x) */
+template<typename Packet> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
+Packet pbetainc(const Packet& a, const Packet& b, const Packet& x) { using numext::betainc; return betainc(a, b, x); }
+
+} // end namespace internal
+
+} // end namespace Eigen
+
+#endif // EIGEN_SPECIALFUNCTIONS_PACKETMATH_H
+
diff --git a/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
new file mode 100644
index 000000000..ec4fa8448
--- /dev/null
+++ b/unsupported/Eigen/src/SpecialFunctions/arch/CUDA/CudaSpecialFunctions.h
@@ -0,0 +1,165 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2014 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_CUDA_SPECIALFUNCTIONS_H
+#define EIGEN_CUDA_SPECIALFUNCTIONS_H
+
+namespace Eigen {
+
+namespace internal {
+
+// Make sure this is only available when targeting a GPU: we don't want to
+// introduce conflicts between these packet_traits definitions and the ones
+// we'll use on the host side (SSE, AVX, ...)
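(The generic templates above simply forward each packet to the scalar numext call; a vectorized target then overrides them per packet type, which is what the CUDA block below does for float4 and double2. The shape of the pattern, shown with a hypothetical op named pfoo and slightly simplified:

    // Illustration only; 'foo' and 'pfoo' are hypothetical stand-ins.
    // Generic fallback: delegate to the scalar implementation.
    template <typename Packet>
    Packet pfoo(const Packet& a) { using numext::foo; return foo(a); }

    // Device-side specialization: fan out over the lanes of the CUDA vector type.
    template <>
    EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
    float4 pfoo<float4>(const float4& a) {
      using numext::foo;
      return make_float4(foo(a.x), foo(a.y), foo(a.z), foo(a.w));
    }
)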
+#if defined(__CUDACC__) && defined(EIGEN_USE_GPU) + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 plgamma<float4>(const float4& a) +{ + return make_float4(lgammaf(a.x), lgammaf(a.y), lgammaf(a.z), lgammaf(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 plgamma<double2>(const double2& a) +{ + using numext::lgamma; + return make_double2(lgamma(a.x), lgamma(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pdigamma<float4>(const float4& a) +{ + using numext::digamma; + return make_float4(digamma(a.x), digamma(a.y), digamma(a.z), digamma(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pdigamma<double2>(const double2& a) +{ + using numext::digamma; + return make_double2(digamma(a.x), digamma(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pzeta<float4>(const float4& x, const float4& q) +{ + using numext::zeta; + return make_float4(zeta(x.x, q.x), zeta(x.y, q.y), zeta(x.z, q.z), zeta(x.w, q.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pzeta<double2>(const double2& x, const double2& q) +{ + using numext::zeta; + return make_double2(zeta(x.x, q.x), zeta(x.y, q.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 ppolygamma<float4>(const float4& n, const float4& x) +{ + using numext::polygamma; + return make_float4(polygamma(n.x, x.x), polygamma(n.y, x.y), polygamma(n.z, x.z), polygamma(n.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 ppolygamma<double2>(const double2& n, const double2& x) +{ + using numext::polygamma; + return make_double2(polygamma(n.x, x.x), polygamma(n.y, x.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 perf<float4>(const float4& a) +{ + return make_float4(erff(a.x), erff(a.y), erff(a.z), erff(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 perf<double2>(const double2& a) +{ + using numext::erf; + return make_double2(erf(a.x), erf(a.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 perfc<float4>(const float4& a) +{ + using numext::erfc; + return make_float4(erfc(a.x), erfc(a.y), erfc(a.z), erfc(a.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 perfc<double2>(const double2& a) +{ + using numext::erfc; + return make_double2(erfc(a.x), erfc(a.y)); +} + + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pigamma<float4>(const float4& a, const float4& x) +{ + using numext::igamma; + return make_float4( + igamma(a.x, x.x), + igamma(a.y, x.y), + igamma(a.z, x.z), + igamma(a.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pigamma<double2>(const double2& a, const double2& x) +{ + using numext::igamma; + return make_double2(igamma(a.x, x.x), igamma(a.y, x.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pigammac<float4>(const float4& a, const float4& x) +{ + using numext::igammac; + return make_float4( + igammac(a.x, x.x), + igammac(a.y, x.y), + igammac(a.z, x.z), + igammac(a.w, x.w)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pigammac<double2>(const double2& a, const double2& x) +{ + using numext::igammac; + return make_double2(igammac(a.x, x.x), igammac(a.y, x.y)); +} + +template<> EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +float4 pbetainc<float4>(const float4& a, const float4& b, const float4& x) +{ + using numext::betainc; + return make_float4( + betainc(a.x, b.x, x.x), + betainc(a.y, b.y, x.y), + betainc(a.z, b.z, x.z), + betainc(a.w, b.w, x.w)); +} + +template<> 
EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE +double2 pbetainc<double2>(const double2& a, const double2& b, const double2& x) +{ + using numext::betainc; + return make_double2(betainc(a.x, b.x, x.x), betainc(a.y, b.y, x.y)); +} + +#endif + +} // end namespace internal + +} // end namespace Eigen + +#endif // EIGEN_CUDA_SPECIALFUNCTIONS_H diff --git a/unsupported/Eigen/src/Splines/CMakeLists.txt b/unsupported/Eigen/src/Splines/CMakeLists.txt deleted file mode 100644 index 55c6271e9..000000000 --- a/unsupported/Eigen/src/Splines/CMakeLists.txt +++ /dev/null @@ -1,6 +0,0 @@ -FILE(GLOB Eigen_Splines_SRCS "*.h") - -INSTALL(FILES - ${Eigen_Splines_SRCS} - DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Splines COMPONENT Devel - ) diff --git a/unsupported/Eigen/src/Splines/Spline.h b/unsupported/Eigen/src/Splines/Spline.h index ddcddfc9a..627f6e482 100644 --- a/unsupported/Eigen/src/Splines/Spline.h +++ b/unsupported/Eigen/src/Splines/Spline.h @@ -94,7 +94,7 @@ namespace Eigen const KnotVectorType& knots() const { return m_knots; } /** - * \brief Returns the knots of the underlying spline. + * \brief Returns the ctrls of the underlying spline. **/ const ControlPointVectorType& ctrls() const { return m_ctrls; } diff --git a/unsupported/doc/examples/BVH_Example.cpp b/unsupported/doc/examples/BVH_Example.cpp index 6b6fac075..afb0c94c2 100644 --- a/unsupported/doc/examples/BVH_Example.cpp +++ b/unsupported/doc/examples/BVH_Example.cpp @@ -6,9 +6,7 @@ using namespace Eigen; typedef AlignedBox<double, 2> Box2d; namespace Eigen { - namespace internal { - Box2d bounding_box(const Vector2d &v) { return Box2d(v, v); } //compute the bounding box of a single point - } + Box2d bounding_box(const Vector2d &v) { return Box2d(v, v); } //compute the bounding box of a single point } struct PointPointMinimizer //how to compute squared distances between points and rectangles diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index fab140871..c0b321617 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -111,10 +111,14 @@ ei_add_test(gmres) ei_add_test(minres) ei_add_test(levenberg_marquardt) ei_add_test(kronecker_product) +ei_add_test(special_functions) # TODO: The following test names are prefixed with the cxx11 string, since historically # the tests depended on c++11. This isn't the case anymore so we ought to rename them. -ei_add_test(cxx11_float16) +# FIXME: Old versions of MSVC fail to compile this code, so we just disable these tests +# when using visual studio. We should make the check more strict to enable the tests for +# newer versions of MSVC. +if (NOT CMAKE_CXX_COMPILER_ID STREQUAL "MSVC") ei_add_test(cxx11_tensor_dimension) ei_add_test(cxx11_tensor_map) ei_add_test(cxx11_tensor_assign) @@ -132,7 +136,8 @@ ei_add_test(cxx11_tensor_io) if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8") # This test requires __uint128_t which is only available on 64bit systems ei_add_test(cxx11_tensor_uint128) -endif() +endif() +endif() if(EIGEN_TEST_CXX11) # It should be safe to always run these tests as there is some fallback code for @@ -188,10 +193,12 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) # Make sure to compile without the -pedantic, -Wundef, -Wnon-virtual-dtor # and -fno-check-new flags since they trigger thousands of compilation warnings # in the CUDA runtime + # Also remove -ansi that is incompatible with std=c++11. 
string(REPLACE "-pedantic" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-Wundef" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-Wnon-virtual-dtor" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") string(REPLACE "-fno-check-new" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") + string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}") message(STATUS "Flags used to compile cuda code: " ${CMAKE_CXX_FLAGS}) @@ -207,7 +214,14 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr") endif() - set(CUDA_NVCC_FLAGS "-std=c++11 ${EIGEN_CUDA_RELAXED_CONSTEXPR} -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\"") + if( (NOT EIGEN_TEST_CXX11) OR (CMAKE_VERSION VERSION_LESS 3.3)) + set(EIGEN_CUDA_CXX11_FLAG "-std=c++11") + else() + # otherwise the flag has already been added because of the above set(CMAKE_CXX_STANDARD 11) + set(EIGEN_CUDA_CXX11_FLAG "") + endif() + + set(CUDA_NVCC_FLAGS "${EIGEN_CUDA_CXX11_FLAG} ${EIGEN_CUDA_RELAXED_CONSTEXPR} -arch compute_${EIGEN_CUDA_COMPUTE_ARCH} -Xcudafe \"--display_error_number\" ${CUDA_NVCC_FLAGS}") cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") @@ -217,6 +231,7 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) ei_add_test(cxx11_tensor_reduction_cuda) ei_add_test(cxx11_tensor_argmax_cuda) ei_add_test(cxx11_tensor_cast_float16_cuda) + ei_add_test(cxx11_tensor_scan_cuda) # The random number generation code requires arch 3.5 or greater. if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34) diff --git a/unsupported/test/FFTW.cpp b/unsupported/test/FFTW.cpp index 1dd6dc97d..8b7528fb7 100644 --- a/unsupported/test/FFTW.cpp +++ b/unsupported/test/FFTW.cpp @@ -18,11 +18,11 @@ using namespace Eigen; template < typename T> -complex<long double> promote(complex<T> x) { return complex<long double>(x.real(),x.imag()); } +complex<long double> promote(complex<T> x) { return complex<long double>((long double)x.real(),(long double)x.imag()); } -complex<long double> promote(float x) { return complex<long double>( x); } -complex<long double> promote(double x) { return complex<long double>( x); } -complex<long double> promote(long double x) { return complex<long double>( x); } +complex<long double> promote(float x) { return complex<long double>((long double)x); } +complex<long double> promote(double x) { return complex<long double>((long double)x); } +complex<long double> promote(long double x) { return complex<long double>((long double)x); } template <typename VT1,typename VT2> @@ -33,7 +33,7 @@ complex<long double> promote(long double x) { return complex<long double>( x); long double pi = acos((long double)-1 ); for (size_t k0=0;k0<(size_t)fftbuf.size();++k0) { complex<long double> acc = 0; - long double phinc = -2.*k0* pi / timebuf.size(); + long double phinc = (long double)(-2.)*k0* pi / timebuf.size(); for (size_t k1=0;k1<(size_t)timebuf.size();++k1) { acc += promote( timebuf[k1] ) * exp( complex<long double>(0,k1*phinc) ); } @@ -54,8 +54,8 @@ complex<long double> promote(long double x) { return complex<long double>( x); long double difpower=0; size_t n = (min)( buf1.size(),buf2.size() ); for (size_t k=0;k<n;++k) { - totalpower += (numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2; - difpower += numext::abs2(buf1[k] - buf2[k]); + totalpower += (long double)((numext::abs2( buf1[k] ) + numext::abs2(buf2[k]) )/2); + difpower += (long double)(numext::abs2(buf1[k] - buf2[k])); } return sqrt(difpower/totalpower); } @@ -93,19 +93,19 @@ void 
test_scalar_generic(int nfft) fft.SetFlag(fft.HalfSpectrum ); fft.fwd( freqBuf,tbuf); VERIFY((size_t)freqBuf.size() == (size_t)( (nfft>>1)+1) ); - VERIFY( fft_rmse(freqBuf,tbuf) < test_precision<T>() );// gross check + VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>() );// gross check fft.ClearFlag(fft.HalfSpectrum ); fft.fwd( freqBuf,tbuf); VERIFY( (size_t)freqBuf.size() == (size_t)nfft); - VERIFY( fft_rmse(freqBuf,tbuf) < test_precision<T>() );// gross check + VERIFY( T(fft_rmse(freqBuf,tbuf)) < test_precision<T>() );// gross check if (nfft&1) return; // odd FFTs get the wrong size inverse FFT ScalarVector tbuf2; fft.inv( tbuf2 , freqBuf); - VERIFY( dif_rmse(tbuf,tbuf2) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>() );// gross check // verify that the Unscaled flag takes effect @@ -121,12 +121,12 @@ void test_scalar_generic(int nfft) //for (size_t i=0;i<(size_t) tbuf.size();++i) // cout << "freqBuf=" << freqBuf[i] << " in2=" << tbuf3[i] << " - in=" << tbuf[i] << " => " << (tbuf3[i] - tbuf[i] ) << endl; - VERIFY( dif_rmse(tbuf,tbuf3) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(tbuf,tbuf3)) < test_precision<T>() );// gross check // verify that ClearFlag works fft.ClearFlag(fft.Unscaled); fft.inv( tbuf2 , freqBuf); - VERIFY( dif_rmse(tbuf,tbuf2) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(tbuf,tbuf2)) < test_precision<T>() );// gross check } template <typename T> @@ -152,10 +152,10 @@ void test_complex_generic(int nfft) inbuf[k]= Complex( (T)(rand()/(double)RAND_MAX - .5), (T)(rand()/(double)RAND_MAX - .5) ); fft.fwd( outbuf , inbuf); - VERIFY( fft_rmse(outbuf,inbuf) < test_precision<T>() );// gross check + VERIFY( T(fft_rmse(outbuf,inbuf)) < test_precision<T>() );// gross check fft.inv( buf3 , outbuf); - VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>() );// gross check // verify that the Unscaled flag takes effect ComplexVector buf4; @@ -163,12 +163,12 @@ void test_complex_generic(int nfft) fft.inv( buf4 , outbuf); for (int k=0;k<nfft;++k) buf4[k] *= T(1./nfft); - VERIFY( dif_rmse(inbuf,buf4) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(inbuf,buf4)) < test_precision<T>() );// gross check // verify that ClearFlag works fft.ClearFlag(fft.Unscaled); fft.inv( buf3 , outbuf); - VERIFY( dif_rmse(inbuf,buf3) < test_precision<T>() );// gross check + VERIFY( T(dif_rmse(inbuf,buf3)) < test_precision<T>() );// gross check } template <typename T> diff --git a/unsupported/test/autodiff.cpp b/unsupported/test/autodiff.cpp index b59fd1c43..2da6dd8f3 100644 --- a/unsupported/test/autodiff.cpp +++ b/unsupported/test/autodiff.cpp @@ -205,6 +205,10 @@ void test_autodiff_hessian() VERIFY_IS_APPROX(y.value().derivatives()(1), s4*std::cos(s1*s3+s2*s4)); VERIFY_IS_APPROX(y.derivatives()(0).derivatives(), -std::sin(s1*s3+s2*s4)*Vector2d(s3*s3,s4*s3)); VERIFY_IS_APPROX(y.derivatives()(1).derivatives(), -std::sin(s1*s3+s2*s4)*Vector2d(s3*s4,s4*s4)); + + ADD z = x(0)*x(1); + VERIFY_IS_APPROX(z.derivatives()(0).derivatives(), Vector2d(0,1)); + VERIFY_IS_APPROX(z.derivatives()(1).derivatives(), Vector2d(1,0)); } double bug_1222() { @@ -234,6 +238,32 @@ double bug_1223() { return t.value() + t2.value(); } +// regression test for some compilation issues with specializations of ScalarBinaryOpTraits +void bug_1260() { + Matrix4d A; + Vector4d v; + A*v; +} + +// check a compilation issue with numext::max +double bug_1261() { + typedef 
AutoDiffScalar<Matrix2d> AD; + typedef Matrix<AD,2,1> VectorAD; + + VectorAD v; + const AD maxVal = v.maxCoeff(); + const AD minVal = v.minCoeff(); + return maxVal.value() + minVal.value(); +} + +double bug_1264() { + typedef AutoDiffScalar<Vector2d> AD; + const AD s; + const Matrix<AD, 3, 1> v1; + const Matrix<AD, 3, 1> v2 = (s + 3.0) * v1; + return v2(0).value(); +} + void test_autodiff() { for(int i = 0; i < g_repeat; i++) { @@ -245,5 +275,7 @@ void test_autodiff() bug_1222(); bug_1223(); + bug_1260(); + bug_1261(); } diff --git a/unsupported/test/autodiff_scalar.cpp b/unsupported/test/autodiff_scalar.cpp index c631c734a..4df2f5c57 100644 --- a/unsupported/test/autodiff_scalar.cpp +++ b/unsupported/test/autodiff_scalar.cpp @@ -36,13 +36,48 @@ template<typename Scalar> void check_atan2() VERIFY_IS_APPROX(res.derivatives(), x.derivatives()); } +template<typename Scalar> void check_hyperbolic_functions() +{ + using std::sinh; + using std::cosh; + using std::tanh; + typedef Matrix<Scalar, 1, 1> Deriv1; + typedef AutoDiffScalar<Deriv1> AD; + Deriv1 p = Deriv1::Random(); + AD val(p.x(),Deriv1::UnitX()); + + Scalar cosh_px = std::cosh(p.x()); + AD res1 = tanh(val); + VERIFY_IS_APPROX(res1.value(), std::tanh(p.x())); + VERIFY_IS_APPROX(res1.derivatives().x(), Scalar(1.0) / (cosh_px * cosh_px)); + AD res2 = sinh(val); + VERIFY_IS_APPROX(res2.value(), std::sinh(p.x())); + VERIFY_IS_APPROX(res2.derivatives().x(), cosh_px); + AD res3 = cosh(val); + VERIFY_IS_APPROX(res3.value(), cosh_px); + VERIFY_IS_APPROX(res3.derivatives().x(), std::sinh(p.x())); + + // Check constant values. + const Scalar sample_point = Scalar(1) / Scalar(3); + val = AD(sample_point,Deriv1::UnitX()); + res1 = tanh(val); + VERIFY_IS_APPROX(res1.derivatives().x(), Scalar(0.896629559604914)); + + res2 = sinh(val); + VERIFY_IS_APPROX(res2.derivatives().x(), Scalar(1.056071867829939)); + + res3 = cosh(val); + VERIFY_IS_APPROX(res3.derivatives().x(), Scalar(0.339540557256150)); +} void test_autodiff_scalar() { for(int i = 0; i < g_repeat; i++) { CALL_SUBTEST_1( check_atan2<float>() ); CALL_SUBTEST_2( check_atan2<double>() ); + CALL_SUBTEST_3( check_hyperbolic_functions<float>() ); + CALL_SUBTEST_4( check_hyperbolic_functions<double>() ); } } diff --git a/unsupported/test/cxx11_float16.cpp b/unsupported/test/cxx11_float16.cpp deleted file mode 100644 index e39a7f83c..000000000 --- a/unsupported/test/cxx11_float16.cpp +++ /dev/null @@ -1,203 +0,0 @@ -// This file is part of Eigen, a lightweight C++ template library -// for linear algebra. -// -// This Source Code Form is subject to the terms of the Mozilla -// Public License v. 2.0. If a copy of the MPL was not distributed -// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. - -#define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_float16 - -#include "main.h" -#include <Eigen/src/Core/arch/CUDA/Half.h> - -using Eigen::half; - -void test_conversion() -{ - // Conversion from float. - VERIFY_IS_EQUAL(half(1.0f).x, 0x3c00); - VERIFY_IS_EQUAL(half(0.5f).x, 0x3800); - VERIFY_IS_EQUAL(half(0.33333f).x, 0x3555); - VERIFY_IS_EQUAL(half(0.0f).x, 0x0000); - VERIFY_IS_EQUAL(half(-0.0f).x, 0x8000); - VERIFY_IS_EQUAL(half(65504.0f).x, 0x7bff); - VERIFY_IS_EQUAL(half(65536.0f).x, 0x7c00); // Becomes infinity. - - // Denormals. - VERIFY_IS_EQUAL(half(-5.96046e-08f).x, 0x8001); - VERIFY_IS_EQUAL(half(5.96046e-08f).x, 0x0001); - VERIFY_IS_EQUAL(half(1.19209e-07f).x, 0x0002); - - // Verify round-to-nearest-even behavior. 
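(The check being deleted here pins down round-to-nearest-even: a float exactly halfway between two adjacent half values rounds to the one whose mantissa is even. A standalone sketch of the same property, assuming Eigen::half is reachable the way the deleted test reached it:

    // Sketch: 1.0f is 0x3c00 and the next half is 0x3c01; their midpoint
    // rounds down because 0x3c00 has the even mantissa.
    #include "main.h"                              // test harness, as in the deleted file
    #include <Eigen/src/Core/arch/CUDA/Half.h>

    void check_round_to_even() {
      float lo = float(Eigen::half(__half(0x3c00)));
      float hi = float(Eigen::half(__half(0x3c01)));
      VERIFY_IS_EQUAL(Eigen::half(0.5f * (lo + hi)).x, 0x3c00);
    }
)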
- float val1 = float(half(__half(0x3c00))); - float val2 = float(half(__half(0x3c01))); - float val3 = float(half(__half(0x3c02))); - VERIFY_IS_EQUAL(half(0.5f * (val1 + val2)).x, 0x3c00); - VERIFY_IS_EQUAL(half(0.5f * (val2 + val3)).x, 0x3c02); - - // Conversion from int. - VERIFY_IS_EQUAL(half(-1).x, 0xbc00); - VERIFY_IS_EQUAL(half(0).x, 0x0000); - VERIFY_IS_EQUAL(half(1).x, 0x3c00); - VERIFY_IS_EQUAL(half(2).x, 0x4000); - VERIFY_IS_EQUAL(half(3).x, 0x4200); - - // Conversion from bool. - VERIFY_IS_EQUAL(half(false).x, 0x0000); - VERIFY_IS_EQUAL(half(true).x, 0x3c00); - - // Conversion to float. - VERIFY_IS_EQUAL(float(half(__half(0x0000))), 0.0f); - VERIFY_IS_EQUAL(float(half(__half(0x3c00))), 1.0f); - - // Denormals. - VERIFY_IS_APPROX(float(half(__half(0x8001))), -5.96046e-08f); - VERIFY_IS_APPROX(float(half(__half(0x0001))), 5.96046e-08f); - VERIFY_IS_APPROX(float(half(__half(0x0002))), 1.19209e-07f); - - // NaNs and infinities. - VERIFY(!(numext::isinf)(float(half(65504.0f)))); // Largest finite number. - VERIFY(!(numext::isnan)(float(half(0.0f)))); - VERIFY((numext::isinf)(float(half(__half(0xfc00))))); - VERIFY((numext::isnan)(float(half(__half(0xfc01))))); - VERIFY((numext::isinf)(float(half(__half(0x7c00))))); - VERIFY((numext::isnan)(float(half(__half(0x7c01))))); - -#if !EIGEN_COMP_MSVC - // Visual Studio errors out on divisions by 0 - VERIFY((numext::isnan)(float(half(0.0 / 0.0)))); - VERIFY((numext::isinf)(float(half(1.0 / 0.0)))); - VERIFY((numext::isinf)(float(half(-1.0 / 0.0)))); -#endif - - // Exactly same checks as above, just directly on the half representation. - VERIFY(!(numext::isinf)(half(__half(0x7bff)))); - VERIFY(!(numext::isnan)(half(__half(0x0000)))); - VERIFY((numext::isinf)(half(__half(0xfc00)))); - VERIFY((numext::isnan)(half(__half(0xfc01)))); - VERIFY((numext::isinf)(half(__half(0x7c00)))); - VERIFY((numext::isnan)(half(__half(0x7c01)))); - -#if !EIGEN_COMP_MSVC - // Visual Studio errors out on divisions by 0 - VERIFY((numext::isnan)(half(0.0 / 0.0))); - VERIFY((numext::isinf)(half(1.0 / 0.0))); - VERIFY((numext::isinf)(half(-1.0 / 0.0))); -#endif -} - -void test_numtraits() -{ - std::cout << "expsilin = " << NumTraits<half>::epsilon() << std::endl; - std::cout << "highest = " << NumTraits<half>::highest() << std::endl; - std::cout << "lowest = " << NumTraits<half>::lowest() << std::endl; - std::cout << "inifinty = " << NumTraits<half>::infinity() << std::endl; - std::cout << "nan = " << NumTraits<half>::quiet_NaN() << std::endl; - -} - -void test_arithmetic() -{ - VERIFY_IS_EQUAL(float(half(2) + half(2)), 4); - VERIFY_IS_EQUAL(float(half(2) + half(-2)), 0); - VERIFY_IS_APPROX(float(half(0.33333f) + half(0.66667f)), 1.0f); - VERIFY_IS_EQUAL(float(half(2.0f) * half(-5.5f)), -11.0f); - VERIFY_IS_APPROX(float(half(1.0f) / half(3.0f)), 0.33333f); - VERIFY_IS_EQUAL(float(-half(4096.0f)), -4096.0f); - VERIFY_IS_EQUAL(float(-half(-4096.0f)), 4096.0f); -} - -void test_comparison() -{ - VERIFY(half(1.0f) > half(0.5f)); - VERIFY(half(0.5f) < half(1.0f)); - VERIFY(!(half(1.0f) < half(0.5f))); - VERIFY(!(half(0.5f) > half(1.0f))); - - VERIFY(!(half(4.0f) > half(4.0f))); - VERIFY(!(half(4.0f) < half(4.0f))); - - VERIFY(!(half(0.0f) < half(-0.0f))); - VERIFY(!(half(-0.0f) < half(0.0f))); - VERIFY(!(half(0.0f) > half(-0.0f))); - VERIFY(!(half(-0.0f) > half(0.0f))); - - VERIFY(half(0.2f) > half(-1.0f)); - VERIFY(half(-1.0f) < half(0.2f)); - VERIFY(half(-16.0f) < half(-15.0f)); - - VERIFY(half(1.0f) == half(1.0f)); - VERIFY(half(1.0f) != half(2.0f)); - - // Comparisons 
with NaNs and infinities. -#if !EIGEN_COMP_MSVC - // Visual Studio errors out on divisions by 0 - VERIFY(!(half(0.0 / 0.0) == half(0.0 / 0.0))); - VERIFY(half(0.0 / 0.0) != half(0.0 / 0.0)); - - VERIFY(!(half(1.0) == half(0.0 / 0.0))); - VERIFY(!(half(1.0) < half(0.0 / 0.0))); - VERIFY(!(half(1.0) > half(0.0 / 0.0))); - VERIFY(half(1.0) != half(0.0 / 0.0)); - - VERIFY(half(1.0) < half(1.0 / 0.0)); - VERIFY(half(1.0) > half(-1.0 / 0.0)); -#endif -} - -void test_basic_functions() -{ - VERIFY_IS_EQUAL(float(numext::abs(half(3.5f))), 3.5f); - VERIFY_IS_EQUAL(float(numext::abs(half(-3.5f))), 3.5f); - - VERIFY_IS_EQUAL(float(numext::floor(half(3.5f))), 3.0f); - VERIFY_IS_EQUAL(float(numext::floor(half(-3.5f))), -4.0f); - - VERIFY_IS_EQUAL(float(numext::ceil(half(3.5f))), 4.0f); - VERIFY_IS_EQUAL(float(numext::ceil(half(-3.5f))), -3.0f); - - VERIFY_IS_APPROX(float(numext::sqrt(half(0.0f))), 0.0f); - VERIFY_IS_APPROX(float(numext::sqrt(half(4.0f))), 2.0f); - - VERIFY_IS_APPROX(float(numext::pow(half(0.0f), half(1.0f))), 0.0f); - VERIFY_IS_APPROX(float(numext::pow(half(2.0f), half(2.0f))), 4.0f); - - VERIFY_IS_EQUAL(float(numext::exp(half(0.0f))), 1.0f); - VERIFY_IS_APPROX(float(numext::exp(half(EIGEN_PI))), float(20.0 + EIGEN_PI)); - - VERIFY_IS_EQUAL(float(numext::log(half(1.0f))), 0.0f); - VERIFY_IS_APPROX(float(numext::log(half(10.0f))), 2.30273f); -} - -void test_trigonometric_functions() -{ - VERIFY_IS_APPROX(numext::cos(half(0.0f)), half(cosf(0.0f))); - VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI)), half(cosf(EIGEN_PI))); - //VERIFY_IS_APPROX(numext::cos(half(EIGEN_PI/2)), half(cosf(EIGEN_PI/2))); - //VERIFY_IS_APPROX(numext::cos(half(3*EIGEN_PI/2)), half(cosf(3*EIGEN_PI/2))); - VERIFY_IS_APPROX(numext::cos(half(3.5f)), half(cosf(3.5f))); - - VERIFY_IS_APPROX(numext::sin(half(0.0f)), half(sinf(0.0f))); - // VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI)), half(sinf(EIGEN_PI))); - VERIFY_IS_APPROX(numext::sin(half(EIGEN_PI/2)), half(sinf(EIGEN_PI/2))); - VERIFY_IS_APPROX(numext::sin(half(3*EIGEN_PI/2)), half(sinf(3*EIGEN_PI/2))); - VERIFY_IS_APPROX(numext::sin(half(3.5f)), half(sinf(3.5f))); - - VERIFY_IS_APPROX(numext::tan(half(0.0f)), half(tanf(0.0f))); - // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI)), half(tanf(EIGEN_PI))); - // VERIFY_IS_APPROX(numext::tan(half(EIGEN_PI/2)), half(tanf(EIGEN_PI/2))); - //VERIFY_IS_APPROX(numext::tan(half(3*EIGEN_PI/2)), half(tanf(3*EIGEN_PI/2))); - VERIFY_IS_APPROX(numext::tan(half(3.5f)), half(tanf(3.5f))); -} - -void test_cxx11_float16() -{ - CALL_SUBTEST(test_conversion()); - CALL_SUBTEST(test_numtraits()); - CALL_SUBTEST(test_arithmetic()); - CALL_SUBTEST(test_comparison()); - CALL_SUBTEST(test_basic_functions()); - CALL_SUBTEST(test_trigonometric_functions()); -} diff --git a/unsupported/test/cxx11_non_blocking_thread_pool.cpp b/unsupported/test/cxx11_non_blocking_thread_pool.cpp index 6569218c4..5f9bb938b 100644 --- a/unsupported/test/cxx11_non_blocking_thread_pool.cpp +++ b/unsupported/test/cxx11_non_blocking_thread_pool.cpp @@ -27,6 +27,8 @@ static void test_parallelism() // Test we never-ever fail to match available tasks with idle threads. const int kThreads = 16; // code below expects that this is a multiple of 4 NonBlockingThreadPool tp(kThreads); + VERIFY_IS_EQUAL(tp.NumThreads(), kThreads); + VERIFY_IS_EQUAL(tp.CurrentThreadId(), -1); for (int iter = 0; iter < 100; ++iter) { std::atomic<int> running(0); std::atomic<int> done(0); @@ -34,6 +36,9 @@ static void test_parallelism() // Schedule kThreads tasks and ensure that they all are running. 
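(The two assertions added above define the thread-id contract that the worker-side checks just below rely on: CurrentThreadId() is -1 on threads that do not belong to the pool, and a stable index in [0, NumThreads()) inside a scheduled task. A minimal standalone sketch of that API, hedged to what this test exercises:

    // Sketch of the NonBlockingThreadPool thread-id contract.
    #include <unsupported/Eigen/CXX11/ThreadPool>
    #include <cassert>

    int main() {
      Eigen::NonBlockingThreadPool pool(4);
      assert(pool.NumThreads() == 4);
      assert(pool.CurrentThreadId() == -1);   // the caller is not a pool thread
      pool.Schedule([&pool]() {
        int id = pool.CurrentThreadId();      // in [0, 4) on a pool thread
        assert(id >= 0 && id < pool.NumThreads());
      });
      return 0;                               // the destructor joins the workers
    }
)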
for (int i = 0; i < kThreads; ++i) { tp.Schedule([&]() { + const int thread_id = tp.CurrentThreadId(); + VERIFY_GE(thread_id, 0); + VERIFY_LE(thread_id, kThreads - 1); running++; while (phase < 1) { } diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu index 4026f48f0..284b46803 100644 --- a/unsupported/test/cxx11_tensor_cuda.cu +++ b/unsupported/test/cxx11_tensor_cuda.cu @@ -1019,6 +1019,153 @@ void test_cuda_erfc(const Scalar stddev) cudaFree(d_out); } +template <typename Scalar> +void test_cuda_betainc() +{ + Tensor<Scalar, 1> in_x(125); + Tensor<Scalar, 1> in_a(125); + Tensor<Scalar, 1> in_b(125); + Tensor<Scalar, 1> out(125); + Tensor<Scalar, 1> expected_out(125); + out.setZero(); + + Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); + + Array<Scalar, 1, Dynamic> x(125); + Array<Scalar, 1, Dynamic> a(125); + Array<Scalar, 1, Dynamic> b(125); + Array<Scalar, 1, Dynamic> v(125); + + a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999; + + b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 
0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, 999.999, + 999.999, 999.999, 999.999; + + x << -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, + 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, + 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, + 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, + -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, + 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, + 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1; + + v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, + nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, + nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan, + 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, + 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan, + 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, nan, + nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256, + 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001, + 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, + 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, + 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan, + 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, nan, + nan, 7.864342668429763e-23, 3.015969667594166e-10, 0.0008598571564165444, + nan, nan, 6.031987710123844e-08, 0.5000000000000007, 0.9999999396801229, + nan, nan, 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, + nan, nan, nan, nan, nan, nan, 0.0, 7.029920380986636e-306, + 2.2450728208591345e-101, nan, nan, 0.0, 9.275871147869727e-302, + 1.2232913026152827e-97, nan, nan, 0.0, 3.0891393081932924e-252, + 2.9303043666183996e-60, nan, nan, 2.248913486879199e-196, + 0.5000000000004947, 0.9999999999999999, nan; + + for (int i = 0; i < 125; ++i) { + in_x(i) = x(i); + in_a(i) = a(i); + in_b(i) = b(i); + expected_out(i) = v(i); + } + + std::size_t bytes = in_x.size() * sizeof(Scalar); + + Scalar* d_in_x; + Scalar* d_in_a; + Scalar* d_in_b; + Scalar* d_out; + cudaMalloc((void**)(&d_in_x), bytes); + cudaMalloc((void**)(&d_in_a), bytes); + cudaMalloc((void**)(&d_in_b), bytes); + cudaMalloc((void**)(&d_out), bytes); + + cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_a(d_in_a, 125); + Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_b(d_in_b, 125); + Eigen::TensorMap<Eigen::Tensor<Scalar, 
1> > gpu_out(d_out, 125); + + gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x); + + assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); + assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + + for (int i = 1; i < 125; ++i) { + if ((std::isnan)(expected_out(i))) { + VERIFY((std::isnan)(out(i))); + } else { + VERIFY_IS_APPROX(out(i), expected_out(i)); + } + } + + cudaFree(d_in_x); + cudaFree(d_in_a); + cudaFree(d_in_b); + cudaFree(d_out); +} + + void test_cxx11_tensor_cuda() { CALL_SUBTEST_1(test_cuda_elementwise_small()); @@ -1086,5 +1233,8 @@ void test_cxx11_tensor_cuda() CALL_SUBTEST_5(test_cuda_igamma<double>()); CALL_SUBTEST_5(test_cuda_igammac<double>()); + + CALL_SUBTEST_6(test_cuda_betainc<float>()); + CALL_SUBTEST_6(test_cuda_betainc<double>()); #endif } diff --git a/unsupported/test/cxx11_tensor_dimension.cpp b/unsupported/test/cxx11_tensor_dimension.cpp index 0bccc3396..16f168ed4 100644 --- a/unsupported/test/cxx11_tensor_dimension.cpp +++ b/unsupported/test/cxx11_tensor_dimension.cpp @@ -21,7 +21,7 @@ static void test_dynamic_size() VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7); - VERIFY_IS_EQUAL(dimensions.TotalSize(), 2*3*7); + VERIFY_IS_EQUAL((int)dimensions.TotalSize(), 2*3*7); VERIFY_IS_EQUAL((int)dimensions[0], 2); VERIFY_IS_EQUAL((int)dimensions[1], 3); VERIFY_IS_EQUAL((int)dimensions[2], 7); @@ -34,12 +34,12 @@ static void test_fixed_size() VERIFY_IS_EQUAL((int)Eigen::internal::array_get<0>(dimensions), 2); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<1>(dimensions), 3); VERIFY_IS_EQUAL((int)Eigen::internal::array_get<2>(dimensions), 7); - VERIFY_IS_EQUAL(dimensions.TotalSize(), 2*3*7); + VERIFY_IS_EQUAL((int)dimensions.TotalSize(), 2*3*7); } static void test_match() { - Eigen::DSizes<int, 3> dyn(2,3,7); + Eigen::DSizes<unsigned int, 3> dyn((unsigned int)2,(unsigned int)3,(unsigned int)7); Eigen::Sizes<2,3,7> stat; VERIFY_IS_EQUAL(Eigen::dimensions_match(dyn, stat), true); @@ -51,13 +51,13 @@ static void test_match() static void test_rank_zero() { Eigen::Sizes<> scalar; - VERIFY_IS_EQUAL(scalar.TotalSize(), 1); - VERIFY_IS_EQUAL(scalar.rank(), 0); - VERIFY_IS_EQUAL(internal::array_prod(scalar), 1); + VERIFY_IS_EQUAL((int)scalar.TotalSize(), 1); + VERIFY_IS_EQUAL((int)scalar.rank(), 0); + VERIFY_IS_EQUAL((int)internal::array_prod(scalar), 1); Eigen::DSizes<ptrdiff_t, 0> dscalar; - VERIFY_IS_EQUAL(dscalar.TotalSize(), 1); - VERIFY_IS_EQUAL(dscalar.rank(), 0u); + VERIFY_IS_EQUAL((int)dscalar.TotalSize(), 1); + VERIFY_IS_EQUAL((int)dscalar.rank(), 0); } void test_cxx11_tensor_dimension() diff --git a/unsupported/test/cxx11_tensor_io.cpp b/unsupported/test/cxx11_tensor_io.cpp index 8bbcf7089..489960529 100644 --- a/unsupported/test/cxx11_tensor_io.cpp +++ b/unsupported/test/cxx11_tensor_io.cpp @@ -14,6 +14,20 @@ template<int DataLayout> +static void test_output_0d() +{ + Tensor<int, 0, DataLayout> tensor; + tensor() = 123; + + std::stringstream os; + os << tensor; + + std::string expected("123"); + VERIFY_IS_EQUAL(std::string(os.str()), expected); +} + + +template<int DataLayout> static void test_output_1d() { Tensor<int, 1, DataLayout> tensor(5); @@ -26,6 +40,12 @@ static void test_output_1d() std::string expected("0\n1\n2\n3\n4"); VERIFY_IS_EQUAL(std::string(os.str()), expected); + + Eigen::Tensor<double,1,DataLayout> 
empty_tensor(0); + std::stringstream empty_os; + empty_os << empty_tensor; + std::string empty_string; + VERIFY_IS_EQUAL(std::string(empty_os.str()), empty_string); } @@ -101,6 +121,8 @@ static void test_output_const() void test_cxx11_tensor_io() { + CALL_SUBTEST(test_output_0d<ColMajor>()); + CALL_SUBTEST(test_output_0d<RowMajor>()); CALL_SUBTEST(test_output_1d<ColMajor>()); CALL_SUBTEST(test_output_1d<RowMajor>()); CALL_SUBTEST(test_output_2d<ColMajor>()); diff --git a/unsupported/test/cxx11_tensor_morphing.cpp b/unsupported/test/cxx11_tensor_morphing.cpp index c575d3fdc..f7de43110 100644 --- a/unsupported/test/cxx11_tensor_morphing.cpp +++ b/unsupported/test/cxx11_tensor_morphing.cpp @@ -13,6 +13,7 @@ using Eigen::Tensor; +template<typename> static void test_simple_reshape() { Tensor<float, 5> tensor1(2,3,1,7,1); @@ -40,7 +41,7 @@ static void test_simple_reshape() } } - +template<typename> static void test_reshape_in_expr() { MatrixXf m1(2,3*5*7*11); MatrixXf m2(3*5*7*11,13); @@ -65,7 +66,7 @@ static void test_reshape_in_expr() { } } - +template<typename> static void test_reshape_as_lvalue() { Tensor<float, 3> tensor(2,3,7); @@ -114,6 +115,7 @@ static void test_simple_slice() } } +template<typename=void> static void test_const_slice() { const float b[1] = {42}; @@ -459,25 +461,25 @@ static void test_composition() void test_cxx11_tensor_morphing() { - CALL_SUBTEST(test_simple_reshape()); - CALL_SUBTEST(test_reshape_in_expr()); - CALL_SUBTEST(test_reshape_as_lvalue()); - - CALL_SUBTEST(test_simple_slice<ColMajor>()); - CALL_SUBTEST(test_simple_slice<RowMajor>()); - CALL_SUBTEST(test_const_slice()); - CALL_SUBTEST(test_slice_in_expr<ColMajor>()); - CALL_SUBTEST(test_slice_in_expr<RowMajor>()); - CALL_SUBTEST(test_slice_as_lvalue<ColMajor>()); - CALL_SUBTEST(test_slice_as_lvalue<RowMajor>()); - CALL_SUBTEST(test_slice_raw_data<ColMajor>()); - CALL_SUBTEST(test_slice_raw_data<RowMajor>()); - - CALL_SUBTEST(test_strided_slice_write<ColMajor>()); - CALL_SUBTEST(test_strided_slice<ColMajor>()); - CALL_SUBTEST(test_strided_slice_write<RowMajor>()); - CALL_SUBTEST(test_strided_slice<RowMajor>()); - - CALL_SUBTEST(test_composition<ColMajor>()); - CALL_SUBTEST(test_composition<RowMajor>()); + CALL_SUBTEST_1(test_simple_reshape<void>()); + CALL_SUBTEST_1(test_reshape_in_expr<void>()); + CALL_SUBTEST_1(test_reshape_as_lvalue<void>()); + + CALL_SUBTEST_1(test_simple_slice<ColMajor>()); + CALL_SUBTEST_1(test_simple_slice<RowMajor>()); + CALL_SUBTEST_1(test_const_slice()); + CALL_SUBTEST_2(test_slice_in_expr<ColMajor>()); + CALL_SUBTEST_3(test_slice_in_expr<RowMajor>()); + CALL_SUBTEST_4(test_slice_as_lvalue<ColMajor>()); + CALL_SUBTEST_4(test_slice_as_lvalue<RowMajor>()); + CALL_SUBTEST_5(test_slice_raw_data<ColMajor>()); + CALL_SUBTEST_5(test_slice_raw_data<RowMajor>()); + + CALL_SUBTEST_6(test_strided_slice_write<ColMajor>()); + CALL_SUBTEST_6(test_strided_slice<ColMajor>()); + CALL_SUBTEST_6(test_strided_slice_write<RowMajor>()); + CALL_SUBTEST_6(test_strided_slice<RowMajor>()); + + CALL_SUBTEST_7(test_composition<ColMajor>()); + CALL_SUBTEST_7(test_composition<RowMajor>()); } diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu index 34e9f54a0..a6375d34a 100644 --- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu +++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu @@ -13,14 +13,53 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU - +#include <cuda_fp16.h> #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> 
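(Everything in this file follows one pattern: evaluate an expression in float, evaluate it again through Eigen::half, and require the two results to agree to half precision. A host-side sketch of the cast round-trip at the core of that comparison, assuming this tree's Tensor module:

    // Sketch: cast a float tensor to half and back; values agree to roughly
    // 3 decimal digits, the precision the GPU comparisons below rely on.
    #include <unsupported/Eigen/CXX11/Tensor>
    #include <iostream>

    int main() {
      Eigen::Tensor<float, 1> f(8);
      f.setRandom();
      Eigen::Tensor<Eigen::half, 1> h = f.cast<Eigen::half>();
      Eigen::Tensor<float, 1> back = h.cast<float>();
      for (int i = 0; i < 8; ++i)
        std::cout << f(i) << " ~ " << back(i) << std::endl;
      return 0;
    }
)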
using Eigen::Tensor; +template<typename> +void test_cuda_numext() { + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + int num_elem = 101; + + float* d_float = (float*)gpu_device.allocate(num_elem * sizeof(float)); + bool* d_res_half = (bool*)gpu_device.allocate(num_elem * sizeof(bool)); + bool* d_res_float = (bool*)gpu_device.allocate(num_elem * sizeof(bool)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float( + d_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_half( + d_res_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<bool, 1>, Eigen::Aligned> gpu_res_float( + d_res_float, num_elem); + + gpu_float.device(gpu_device) = gpu_float.random() - gpu_float.constant(0.5f); + gpu_res_float.device(gpu_device) = gpu_float.unaryExpr(Eigen::internal::scalar_isnan_op<float>()); + gpu_res_half.device(gpu_device) = gpu_float.cast<Eigen::half>().unaryExpr(Eigen::internal::scalar_isnan_op<Eigen::half>()); + + Tensor<bool, 1> half_prec(num_elem); + Tensor<bool, 1> full_prec(num_elem); + gpu_device.memcpyDeviceToHost(half_prec.data(), d_res_half, num_elem*sizeof(bool)); + gpu_device.memcpyDeviceToHost(full_prec.data(), d_res_float, num_elem*sizeof(bool)); + gpu_device.synchronize(); + + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking numext " << i << std::endl; + VERIFY_IS_EQUAL(full_prec(i), half_prec(i)); + } + + gpu_device.deallocate(d_float); + gpu_device.deallocate(d_res_half); + gpu_device.deallocate(d_res_float); +} + + #ifdef EIGEN_HAS_CUDA_FP16 +template<typename> void test_cuda_conversion() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -55,7 +94,7 @@ void test_cuda_conversion() { gpu_device.deallocate(d_conv); } - +template<typename> void test_cuda_unary() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -92,7 +131,7 @@ void test_cuda_unary() { gpu_device.deallocate(d_res_float); } - +template<typename> void test_cuda_elementwise() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -134,6 +173,7 @@ void test_cuda_elementwise() { gpu_device.deallocate(d_res_float); } +template<typename> void test_cuda_trancendental() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -141,30 +181,39 @@ void test_cuda_trancendental() { float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float)); float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float)); + float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float)); Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); - - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1( - d_float1, num_elem); - Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2( - d_float2, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half( - d_res1_half, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float( - d_res1_float, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half( - d_res2_half, num_elem); - Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float( - 
d_res2_float, num_elem); + Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half)); + + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem); + Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem); + Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem); gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f); gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f); + gpu_float3.device(gpu_device) = gpu_float3.random(); gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>(); gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>(); - gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().exp(); - gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>().log(); + gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>(); + + gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>(); + gpu_res1_half.device(gpu_device) = gpu_res1_half.exp(); + + gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>(); + gpu_res2_half.device(gpu_device) = gpu_res2_half.log(); + + gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>(); + gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p(); Tensor<float, 1> input1(num_elem); Tensor<Eigen::half, 1> half_prec1(num_elem); @@ -172,12 +221,18 @@ void test_cuda_trancendental() { Tensor<float, 1> input2(num_elem); Tensor<Eigen::half, 1> half_prec2(num_elem); Tensor<Eigen::half, 1> full_prec2(num_elem); + Tensor<float, 1> input3(num_elem); + Tensor<Eigen::half, 1> half_prec3(num_elem); + Tensor<Eigen::half, 1> full_prec3(num_elem); gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float)); gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float)); + gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float)); gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half)); gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half)); gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half)); gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half)); + gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half)); gpu_device.synchronize(); for (int i = 0; i < num_elem; ++i) { @@ -186,17 +241,27 @@ void test_cuda_trancendental() { } for (int i = 0; i < num_elem; ++i) { std::cout << "Checking elemwise log " << i << " input = " << input2(i) << " full = " << 
full_prec2(i) << " half = " << half_prec2(i) << std::endl; - VERIFY_IS_APPROX(full_prec2(i), half_prec2(i)); + if(std::abs(input2(i)-1.f)<0.05f) // log lacks accuracy near 1 + VERIFY_IS_APPROX(full_prec2(i)+Eigen::half(0.1f), half_prec2(i)+Eigen::half(0.1f)); + else + VERIFY_IS_APPROX(full_prec2(i), half_prec2(i)); + } + for (int i = 0; i < num_elem; ++i) { + std::cout << "Checking elemwise log1p " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl; + VERIFY_IS_APPROX(full_prec3(i), half_prec3(i)); } gpu_device.deallocate(d_float1); gpu_device.deallocate(d_float2); + gpu_device.deallocate(d_float3); gpu_device.deallocate(d_res1_half); gpu_device.deallocate(d_res1_float); gpu_device.deallocate(d_res2_half); gpu_device.deallocate(d_res2_float); + gpu_device.deallocate(d_res3_float); + gpu_device.deallocate(d_res3_half); } - +template<typename> void test_cuda_contractions() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -247,7 +312,7 @@ void test_cuda_contractions() { gpu_device.deallocate(d_res_float); } - +template<typename> void test_cuda_reductions(int size1, int size2, int redux) { std::cout << "Reducing " << size1 << " by " << size2 @@ -296,17 +361,19 @@ void test_cuda_reductions(int size1, int size2, int redux) { gpu_device.deallocate(d_res_float); } +template<typename> void test_cuda_reductions() { - test_cuda_reductions(13, 13, 0); - test_cuda_reductions(13, 13, 1); + test_cuda_reductions<void>(13, 13, 0); + test_cuda_reductions<void>(13, 13, 1); - test_cuda_reductions(35, 36, 0); - test_cuda_reductions(35, 36, 1); + test_cuda_reductions<void>(35, 36, 0); + test_cuda_reductions<void>(35, 36, 1); - test_cuda_reductions(36, 35, 0); - test_cuda_reductions(36, 35, 1); + test_cuda_reductions<void>(36, 35, 0); + test_cuda_reductions<void>(36, 35, 1); } +template<typename> void test_cuda_full_reductions() { Eigen::CudaStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); @@ -355,7 +422,7 @@ void test_cuda_full_reductions() { gpu_device.deallocate(d_res_float); } - +template<typename> void test_cuda_forced_evals() { Eigen::CudaStreamDevice stream; @@ -408,15 +475,17 @@ void test_cuda_forced_evals() { void test_cxx11_tensor_of_float16_cuda() { + CALL_SUBTEST_1(test_cuda_numext<void>()); + #ifdef EIGEN_HAS_CUDA_FP16 - CALL_SUBTEST_1(test_cuda_conversion()); - CALL_SUBTEST_1(test_cuda_unary()); - CALL_SUBTEST_1(test_cuda_elementwise()); - CALL_SUBTEST_1(test_cuda_trancendental()); - CALL_SUBTEST_2(test_cuda_contractions()); - CALL_SUBTEST_3(test_cuda_reductions()); - CALL_SUBTEST_4(test_cuda_full_reductions()); - CALL_SUBTEST_5(test_cuda_forced_evals()); + CALL_SUBTEST_1(test_cuda_conversion<void>()); + CALL_SUBTEST_1(test_cuda_unary<void>()); + CALL_SUBTEST_1(test_cuda_elementwise<void>()); + CALL_SUBTEST_1(test_cuda_trancendental<void>()); + CALL_SUBTEST_2(test_cuda_contractions<void>()); + CALL_SUBTEST_3(test_cuda_reductions<void>()); + CALL_SUBTEST_4(test_cuda_full_reductions<void>()); + CALL_SUBTEST_5(test_cuda_forced_evals<void>()); #else std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl; #endif diff --git a/unsupported/test/cxx11_tensor_reduction.cpp b/unsupported/test/cxx11_tensor_reduction.cpp index ca483257b..1490ec3da 100644 --- a/unsupported/test/cxx11_tensor_reduction.cpp +++ b/unsupported/test/cxx11_tensor_reduction.cpp @@ -239,6 +239,33 @@ static void test_simple_reductions() { } } + +template <int DataLayout> +static void 
test_reductions_in_expr() { + Tensor<float, 4, DataLayout> tensor(2, 3, 5, 7); + tensor.setRandom(); + array<ptrdiff_t, 2> reduction_axis2; + reduction_axis2[0] = 1; + reduction_axis2[1] = 3; + + Tensor<float, 2, DataLayout> result(2, 5); + result = result.constant(1.0f) - tensor.sum(reduction_axis2); + VERIFY_IS_EQUAL(result.dimension(0), 2); + VERIFY_IS_EQUAL(result.dimension(1), 5); + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 5; ++j) { + float sum = 0.0f; + for (int k = 0; k < 3; ++k) { + for (int l = 0; l < 7; ++l) { + sum += tensor(i, k, j, l); + } + } + VERIFY_IS_APPROX(result(i, j), 1.0f - sum); + } + } +} + + template <int DataLayout> static void test_full_reductions() { Tensor<float, 2, DataLayout> tensor(2, 3); @@ -462,6 +489,8 @@ void test_cxx11_tensor_reduction() { CALL_SUBTEST(test_trivial_reductions<RowMajor>()); CALL_SUBTEST(test_simple_reductions<ColMajor>()); CALL_SUBTEST(test_simple_reductions<RowMajor>()); + CALL_SUBTEST(test_reductions_in_expr<ColMajor>()); + CALL_SUBTEST(test_reductions_in_expr<RowMajor>()); CALL_SUBTEST(test_full_reductions<ColMajor>()); CALL_SUBTEST(test_full_reductions<RowMajor>()); CALL_SUBTEST(test_user_defined_reductions<ColMajor>()); diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu index cad0c08e0..6d8f01c02 100644 --- a/unsupported/test/cxx11_tensor_reduction_cuda.cu +++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu @@ -16,7 +16,7 @@ #include <unsupported/Eigen/CXX11/Tensor> -template<int DataLayout> +template<typename Type, int DataLayout> static void test_full_reductions() { Eigen::CudaStreamDevice stream; @@ -25,24 +25,24 @@ static void test_full_reductions() { const int num_rows = internal::random<int>(1024, 5*1024); const int num_cols = internal::random<int>(1024, 5*1024); - Tensor<float, 2, DataLayout> in(num_rows, num_cols); + Tensor<Type, 2, DataLayout> in(num_rows, num_cols); in.setRandom(); - Tensor<float, 0, DataLayout> full_redux; + Tensor<Type, 0, DataLayout> full_redux; full_redux = in.sum(); - std::size_t in_bytes = in.size() * sizeof(float); - std::size_t out_bytes = full_redux.size() * sizeof(float); - float* gpu_in_ptr = static_cast<float*>(gpu_device.allocate(in_bytes)); - float* gpu_out_ptr = static_cast<float*>(gpu_device.allocate(out_bytes)); + std::size_t in_bytes = in.size() * sizeof(Type); + std::size_t out_bytes = full_redux.size() * sizeof(Type); + Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes)); + Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes)); gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes); - TensorMap<Tensor<float, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols); - TensorMap<Tensor<float, 0, DataLayout> > out_gpu(gpu_out_ptr); + TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols); + TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr); out_gpu.device(gpu_device) = in_gpu.sum(); - Tensor<float, 0, DataLayout> full_redux_gpu; + Tensor<Type, 0, DataLayout> full_redux_gpu; gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes); gpu_device.synchronize(); @@ -54,6 +54,8 @@ static void test_full_reductions() { } void test_cxx11_tensor_reduction_cuda() { - CALL_SUBTEST_1(test_full_reductions<ColMajor>()); - CALL_SUBTEST_2(test_full_reductions<RowMajor>()); + CALL_SUBTEST_1((test_full_reductions<float, ColMajor>())); + CALL_SUBTEST_1((test_full_reductions<double, ColMajor>())); + CALL_SUBTEST_2((test_full_reductions<float, 
RowMajor>())); + CALL_SUBTEST_2((test_full_reductions<double, RowMajor>())); } diff --git a/unsupported/test/cxx11_tensor_scan.cpp b/unsupported/test/cxx11_tensor_scan.cpp index dbd3023d7..af59aa3ef 100644 --- a/unsupported/test/cxx11_tensor_scan.cpp +++ b/unsupported/test/cxx11_tensor_scan.cpp @@ -14,63 +14,73 @@ using Eigen::Tensor; -template <int DataLayout, typename Type=float> +template <int DataLayout, typename Type=float, bool Exclusive = false> static void test_1d_scan() { - int size = 50; - Tensor<Type, 1, DataLayout> tensor(size); - tensor.setRandom(); - Tensor<Type, 1, DataLayout> result = tensor.cumsum(0); + int size = 50; + Tensor<Type, 1, DataLayout> tensor(size); + tensor.setRandom(); + Tensor<Type, 1, DataLayout> result = tensor.cumsum(0, Exclusive); - VERIFY_IS_EQUAL(tensor.dimension(0), result.dimension(0)); + VERIFY_IS_EQUAL(tensor.dimension(0), result.dimension(0)); - float accum = 0; - for (int i = 0; i < size; i++) { + float accum = 0; + for (int i = 0; i < size; i++) { + if (Exclusive) { + VERIFY_IS_EQUAL(result(i), accum); + accum += tensor(i); + } else { accum += tensor(i); VERIFY_IS_EQUAL(result(i), accum); } + } - accum = 1; - result = tensor.cumprod(0); - for (int i = 0; i < size; i++) { + accum = 1; + result = tensor.cumprod(0, Exclusive); + for (int i = 0; i < size; i++) { + if (Exclusive) { + VERIFY_IS_EQUAL(result(i), accum); + accum *= tensor(i); + } else { accum *= tensor(i); VERIFY_IS_EQUAL(result(i), accum); } + } } template <int DataLayout, typename Type=float> static void test_4d_scan() { - int size = 5; - Tensor<Type, 4, DataLayout> tensor(size, size, size, size); - tensor.setRandom(); + int size = 5; + Tensor<Type, 4, DataLayout> tensor(size, size, size, size); + tensor.setRandom(); - Tensor<Type, 4, DataLayout> result(size, size, size, size); + Tensor<Type, 4, DataLayout> result(size, size, size, size); - result = tensor.cumsum(0); - float accum = 0; - for (int i = 0; i < size; i++) { - accum += tensor(i, 0, 0, 0); - VERIFY_IS_EQUAL(result(i, 0, 0, 0), accum); - } - result = tensor.cumsum(1); - accum = 0; - for (int i = 0; i < size; i++) { - accum += tensor(0, i, 0, 0); - VERIFY_IS_EQUAL(result(0, i, 0, 0), accum); - } - result = tensor.cumsum(2); - accum = 0; - for (int i = 0; i < size; i++) { - accum += tensor(0, 0, i, 0); - VERIFY_IS_EQUAL(result(0, 0, i, 0), accum); - } - result = tensor.cumsum(3); - accum = 0; - for (int i = 0; i < size; i++) { - accum += tensor(0, 0, 0, i); - VERIFY_IS_EQUAL(result(0, 0, 0, i), accum); - } + result = tensor.cumsum(0); + float accum = 0; + for (int i = 0; i < size; i++) { + accum += tensor(i, 1, 2, 3); + VERIFY_IS_EQUAL(result(i, 1, 2, 3), accum); + } + result = tensor.cumsum(1); + accum = 0; + for (int i = 0; i < size; i++) { + accum += tensor(1, i, 2, 3); + VERIFY_IS_EQUAL(result(1, i, 2, 3), accum); + } + result = tensor.cumsum(2); + accum = 0; + for (int i = 0; i < size; i++) { + accum += tensor(1, 2, i, 3); + VERIFY_IS_EQUAL(result(1, 2, i, 3), accum); + } + result = tensor.cumsum(3); + accum = 0; + for (int i = 0; i < size; i++) { + accum += tensor(1, 2, 3, i); + VERIFY_IS_EQUAL(result(1, 2, 3, i), accum); + } } template <int DataLayout> @@ -89,8 +99,10 @@ static void test_tensor_maps() { } void test_cxx11_tensor_scan() { - CALL_SUBTEST(test_1d_scan<ColMajor>()); - CALL_SUBTEST(test_1d_scan<RowMajor>()); + CALL_SUBTEST((test_1d_scan<ColMajor, float, true>())); + CALL_SUBTEST((test_1d_scan<ColMajor, float, false>())); + CALL_SUBTEST((test_1d_scan<RowMajor, float, true>())); + 
CALL_SUBTEST((test_1d_scan<RowMajor, float, false>())); CALL_SUBTEST(test_4d_scan<ColMajor>()); CALL_SUBTEST(test_4d_scan<RowMajor>()); CALL_SUBTEST(test_tensor_maps<ColMajor>()); diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu new file mode 100644 index 000000000..35e19e51c --- /dev/null +++ b/unsupported/test/cxx11_tensor_scan_cuda.cu @@ -0,0 +1,77 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int +#define EIGEN_USE_GPU + + +#include "main.h" +#include <unsupported/Eigen/CXX11/Tensor> + +using Eigen::Tensor; +typedef Tensor<float, 1>::DimensionPair DimPair; + +template<int DataLayout> +void test_cuda_cumsum(int m_size, int k_size, int n_size) +{ + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; + Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size); + Tensor<float, 3, DataLayout> t_result(m_size, k_size, n_size); + Tensor<float, 3, DataLayout> t_result_gpu(m_size, k_size, n_size); + + t_input.setRandom(); + + std::size_t t_input_bytes = t_input.size() * sizeof(float); + std::size_t t_result_bytes = t_result.size() * sizeof(float); + + float* d_t_input; + float* d_t_result; + + cudaMalloc((void**)(&d_t_input), t_input_bytes); + cudaMalloc((void**)(&d_t_result), t_result_bytes); + + cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice); + + Eigen::CudaStreamDevice stream; + Eigen::GpuDevice gpu_device(&stream); + + Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > + gpu_t_input(d_t_input, Eigen::array<int, 3>(m_size, k_size, n_size)); + Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > + gpu_t_result(d_t_result, Eigen::array<int, 3>(m_size, k_size, n_size)); + + gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1); + t_result = t_input.cumsum(1); + + cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + for (size_t i = 0; i < t_result.size(); i++) { + if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 1e-4f)) { + continue; + } + std::cout << "mismatch detected at index " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + assert(false); + } + + cudaFree((void*)d_t_input); + cudaFree((void*)d_t_result); +} + + +void test_cxx11_tensor_scan_cuda() +{ + CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128)); + CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128)); +} diff --git a/unsupported/test/cxx11_tensor_sugar.cpp b/unsupported/test/cxx11_tensor_sugar.cpp index a03f75cfe..2f56eb495 100644 --- a/unsupported/test/cxx11_tensor_sugar.cpp +++ b/unsupported/test/cxx11_tensor_sugar.cpp @@ -33,7 +33,7 @@ static void test_comparison_sugar() { } -static void test_scalar_sugar() { +static void test_scalar_sugar_add_mul() { Tensor<float, 3> A(6, 7, 5); Tensor<float, 3> B(6, 7, 5); A.setRandom(); @@ -41,21 +41,41 @@ static void test_scalar_sugar() { const float alpha = 0.43f; const float beta = 0.21f; + const float gamma = 0.14f; - Tensor<float, 
3> R = A * A.constant(alpha) + B * B.constant(beta); - Tensor<float, 3> S = A * alpha + B * beta; - - // TODO: add enough syntactic sugar to support this - // Tensor<float, 3> T = alpha * A + beta * B; + Tensor<float, 3> R = A.constant(gamma) + A * A.constant(alpha) + B * B.constant(beta); + Tensor<float, 3> S = A * alpha + B * beta + gamma; + Tensor<float, 3> T = gamma + alpha * A + beta * B; for (int i = 0; i < 6*7*5; ++i) { VERIFY_IS_APPROX(R(i), S(i)); + VERIFY_IS_APPROX(R(i), T(i)); } } +static void test_scalar_sugar_sub_div() { + Tensor<float, 3> A(6, 7, 5); + Tensor<float, 3> B(6, 7, 5); + A.setRandom(); + B.setRandom(); + + const float alpha = 0.43f; + const float beta = 0.21f; + const float gamma = 0.14f; + const float delta = 0.32f; + + Tensor<float, 3> R = A.constant(gamma) - A / A.constant(alpha) + - B.constant(beta) / B - A.constant(delta); + Tensor<float, 3> S = gamma - A / alpha - beta / B - delta; + + for (int i = 0; i < 6*7*5; ++i) { + VERIFY_IS_APPROX(R(i), S(i)); + } +} void test_cxx11_tensor_sugar() { CALL_SUBTEST(test_comparison_sugar()); - CALL_SUBTEST(test_scalar_sugar()); + CALL_SUBTEST(test_scalar_sugar_add_mul()); + CALL_SUBTEST(test_scalar_sugar_sub_div()); } diff --git a/unsupported/test/kronecker_product.cpp b/unsupported/test/kronecker_product.cpp index 02411a262..e770049e5 100644 --- a/unsupported/test/kronecker_product.cpp +++ b/unsupported/test/kronecker_product.cpp @@ -9,12 +9,12 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. +#ifdef EIGEN_TEST_PART_1 #include "sparse.h" #include <Eigen/SparseExtra> #include <Eigen/KroneckerProduct> - template<typename MatrixType> void check_dimension(const MatrixType& ab, const int rows, const int cols) { @@ -230,3 +230,23 @@ void test_kronecker_product() VERIFY_IS_APPROX(MatrixXf(sC2),dC); } } + +#endif + +#ifdef EIGEN_TEST_PART_2 + +// Simply check that, for a dense Kronecker product, the sparse module is not needed + +#include "main.h" +#include <Eigen/KroneckerProduct> + +void test_kronecker_product() +{ + MatrixXd a(2,2), b(3,3), c; + a.setRandom(); + b.setRandom(); + c = kroneckerProduct(a,b); + VERIFY_IS_APPROX(c.block(3,3,3,3), a(1,1)*b); +} + +#endif diff --git a/unsupported/test/mpreal_support.cpp b/unsupported/test/mpreal_support.cpp index 1aa9e786a..ffa5691eb 100644 --- a/unsupported/test/mpreal_support.cpp +++ b/unsupported/test/mpreal_support.cpp @@ -17,6 +17,7 @@ void test_mpreal_support() std::cerr << "dummy_precision = " << NumTraits<mpreal>::dummy_precision() << "\n"; std::cerr << "highest = " << NumTraits<mpreal>::highest() << "\n"; std::cerr << "lowest = " << NumTraits<mpreal>::lowest() << "\n"; + std::cerr << "digits10 = " << NumTraits<mpreal>::digits10() << "\n"; for(int i = 0; i < g_repeat; i++) { int s = Eigen::internal::random<int>(1,100); diff --git a/unsupported/test/special_functions.cpp b/unsupported/test/special_functions.cpp new file mode 100644 index 000000000..057fb3e92 --- /dev/null +++ b/unsupported/test/special_functions.cpp @@ -0,0 +1,345 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 Gael Guennebaud <gael.guennebaud@inria.fr> +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
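Taken together, test_scalar_sugar_add_mul and test_scalar_sugar_sub_div pin down the new scalar sugar: a bare scalar may now appear on either side of +, -, * and / against a tensor expression, instead of requiring an explicit constant() tensor. A rough usage sketch (the sizes and values here are illustrative, not taken from the test):

    #include <unsupported/Eigen/CXX11/Tensor>

    void scalar_sugar_demo() {
      Eigen::Tensor<float, 3> A(6, 7, 5);
      A.setConstant(2.0f);
      // Scalars on either side are lifted into elementwise functors:
      Eigen::Tensor<float, 3> T = 1.0f + 0.5f * A - 3.0f / A;  // 1 + 1 - 1.5 == 0.5 everywhere
      // Equivalent pre-existing spelling via constant():
      Eigen::Tensor<float, 3> R = A.constant(1.0f) + A * A.constant(0.5f) - A.constant(3.0f) / A;
    }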
+ +#include "main.h" +#include "../Eigen/SpecialFunctions" + +template<typename X, typename Y> +void verify_component_wise(const X& x, const Y& y) +{ + for(Index i=0; i<x.size(); ++i) + { + if((numext::isfinite)(y(i))) + VERIFY_IS_APPROX( x(i), y(i) ); + else if((numext::isnan)(y(i))) + VERIFY((numext::isnan)(x(i))); + else + VERIFY_IS_EQUAL( x(i), y(i) ); + } +} + +template<typename ArrayType> void array_special_functions() +{ + using std::abs; + using std::sqrt; + typedef typename ArrayType::Scalar Scalar; + typedef typename NumTraits<Scalar>::Real RealScalar; + + Scalar plusinf = std::numeric_limits<Scalar>::infinity(); + Scalar nan = std::numeric_limits<Scalar>::quiet_NaN(); + + Index rows = internal::random<Index>(1,30); + Index cols = 1; + + // API + { + ArrayType m1 = ArrayType::Random(rows,cols); +#if EIGEN_HAS_C99_MATH + VERIFY_IS_APPROX(m1.lgamma(), lgamma(m1)); + VERIFY_IS_APPROX(m1.digamma(), digamma(m1)); + VERIFY_IS_APPROX(m1.erf(), erf(m1)); + VERIFY_IS_APPROX(m1.erfc(), erfc(m1)); +#endif // EIGEN_HAS_C99_MATH + } + + +#if EIGEN_HAS_C99_MATH + // check special functions (comparing against numpy implementation) + if (!NumTraits<Scalar>::IsComplex) + { + + { + ArrayType m1 = ArrayType::Random(rows,cols); + ArrayType m2 = ArrayType::Random(rows,cols); + + // Test various properties of igamma & igammac. These are normalized + // gamma integrals where + // igammac(a, x) = Gamma(a, x) / Gamma(a) + // igamma(a, x) = gamma(a, x) / Gamma(a) + // where Gamma and gamma are considered the standard unnormalized + // upper and lower incomplete gamma functions, respectively. + ArrayType a = m1.abs() + 2; + ArrayType x = m2.abs() + 2; + ArrayType zero = ArrayType::Zero(rows, cols); + ArrayType one = ArrayType::Constant(rows, cols, Scalar(1.0)); + ArrayType a_m1 = a - one; + ArrayType Gamma_a_x = Eigen::igammac(a, x) * a.lgamma().exp(); + ArrayType Gamma_a_m1_x = Eigen::igammac(a_m1, x) * a_m1.lgamma().exp(); + ArrayType gamma_a_x = Eigen::igamma(a, x) * a.lgamma().exp(); + ArrayType gamma_a_m1_x = Eigen::igamma(a_m1, x) * a_m1.lgamma().exp(); + + // Gamma(a, 0) == Gamma(a) + VERIFY_IS_APPROX(Eigen::igammac(a, zero), one); + + // Gamma(a, x) + gamma(a, x) == Gamma(a) + VERIFY_IS_APPROX(Gamma_a_x + gamma_a_x, a.lgamma().exp()); + + // Gamma(a, x) == (a - 1) * Gamma(a-1, x) + x^(a-1) * exp(-x) + VERIFY_IS_APPROX(Gamma_a_x, (a - 1) * Gamma_a_m1_x + x.pow(a-1) * (-x).exp()); + + // gamma(a, x) == (a - 1) * gamma(a-1, x) - x^(a-1) * exp(-x) + VERIFY_IS_APPROX(gamma_a_x, (a - 1) * gamma_a_m1_x - x.pow(a-1) * (-x).exp()); + } + + { + // Check exact values of igamma and igammac against a third party calculation. + Scalar a_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + Scalar x_s[] = {Scalar(0), Scalar(1), Scalar(1.5), Scalar(4), Scalar(0.0001), Scalar(1000.5)}; + + // location i*6+j corresponds to a_s[i], x_s[j]. 
+ Scalar igamma_s[][6] = {{0.0, nan, nan, nan, nan, nan}, + {0.0, 0.6321205588285578, 0.7768698398515702, + 0.9816843611112658, 9.999500016666262e-05, 1.0}, + {0.0, 0.4275932955291202, 0.608374823728911, + 0.9539882943107686, 7.522076445089201e-07, 1.0}, + {0.0, 0.01898815687615381, 0.06564245437845008, + 0.5665298796332909, 4.166333347221828e-18, 1.0}, + {0.0, 0.9999780593618628, 0.9999899967080838, + 0.9999996219837988, 0.9991370418689945, 1.0}, + {0.0, 0.0, 0.0, 0.0, 0.0, 0.5042041932513908}}; + Scalar igammac_s[][6] = {{nan, nan, nan, nan, nan, nan}, + {1.0, 0.36787944117144233, 0.22313016014842982, + 0.018315638888734182, 0.9999000049998333, 0.0}, + {1.0, 0.5724067044708798, 0.3916251762710878, + 0.04601170568923136, 0.9999992477923555, 0.0}, + {1.0, 0.9810118431238462, 0.9343575456215499, + 0.4334701203667089, 1.0, 0.0}, + {1.0, 2.1940638138146658e-05, 1.0003291916285e-05, + 3.7801620118431334e-07, 0.0008629581310054535, + 0.0}, + {1.0, 1.0, 1.0, 1.0, 1.0, 0.49579580674813944}}; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 6; ++j) { + if ((std::isnan)(igamma_s[i][j])) { + VERIFY((std::isnan)(numext::igamma(a_s[i], x_s[j]))); + } else { + VERIFY_IS_APPROX(numext::igamma(a_s[i], x_s[j]), igamma_s[i][j]); + } + + if ((std::isnan)(igammac_s[i][j])) { + VERIFY((std::isnan)(numext::igammac(a_s[i], x_s[j]))); + } else { + VERIFY_IS_APPROX(numext::igammac(a_s[i], x_s[j]), igammac_s[i][j]); + } + } + } + } + } +#endif // EIGEN_HAS_C99_MATH + + // Check the zeta function against scipy.special.zeta + { + ArrayType x(7), q(7), res(7), ref(7); + x << 1.5, 4, 10.5, 10000.5, 3, 1, 0.9; + q << 2, 1.5, 3, 1.0001, -2.5, 1.2345, 1.2345; + ref << 1.61237534869, 0.234848505667, 1.03086757337e-5, 0.367879440865, 0.054102025820864097, plusinf, nan; + CALL_SUBTEST( verify_component_wise(ref, ref); ); + CALL_SUBTEST( res = x.zeta(q); verify_component_wise(res, ref); ); + CALL_SUBTEST( res = zeta(x,q); verify_component_wise(res, ref); ); + } + + // digamma + { + ArrayType x(7), res(7), ref(7); + x << 1, 1.5, 4, -10.5, 10000.5, 0, -1; + ref << -0.5772156649015329, 0.03648997397857645, 1.2561176684318, 2.398239129535781, 9.210340372392849, plusinf, plusinf; + CALL_SUBTEST( verify_component_wise(ref, ref); ); + + CALL_SUBTEST( res = x.digamma(); verify_component_wise(res, ref); ); + CALL_SUBTEST( res = digamma(x); verify_component_wise(res, ref); ); + } + + +#if EIGEN_HAS_C99_MATH + { + ArrayType n(11), x(11), res(11), ref(11); + n << 1, 1, 1, 1.5, 17, 31, 28, 8, 42, 147, 170; + x << 2, 3, 25.5, 1.5, 4.7, 11.8, 17.7, 30.2, 15.8, 54.1, 64; + ref << 0.644934066848, 0.394934066848, 0.0399946696496, nan, 293.334565435, 0.445487887616, -2.47810300902e-07, -8.29668781082e-09, -0.434562276666, 0.567742190178, -0.0108615497927; + CALL_SUBTEST( verify_component_wise(ref, ref); ); + + if(sizeof(RealScalar)>=8) { // double + // Reason for commented line: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1232 + // CALL_SUBTEST( res = x.polygamma(n); verify_component_wise(res, ref); ); + CALL_SUBTEST( res = polygamma(n,x); verify_component_wise(res, ref); ); + } + else { + // CALL_SUBTEST( res = x.polygamma(n); verify_component_wise(res.head(8), ref.head(8)); ); + CALL_SUBTEST( res = polygamma(n,x); verify_component_wise(res.head(8), ref.head(8)); ); + } + } +#endif + +#if EIGEN_HAS_C99_MATH + { + // Inputs and ground truth generated with scipy via: + // a = np.logspace(-3, 3, 5) - 1e-3 + // b = np.logspace(-3, 3, 5) - 1e-3 + // x = np.linspace(-0.1, 1.1, 5) + // (full_a, full_b, full_x) = np.vectorize(lambda a, b, x: 
(a, b, x))(*np.ix_(a, b, x)) + // full_a = full_a.flatten().tolist() # same for full_b, full_x + // v = scipy.special.betainc(full_a, full_b, full_x).flatten().tolist() + // + // Note in Eigen, we call betainc with arguments in the order (x, a, b). + ArrayType a(125); + ArrayType b(125); + ArrayType x(125); + ArrayType v(125); + ArrayType res(125); + + a << 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, 0.999, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, 999.999, + 999.999, 999.999, 999.999; + + b << 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, 0.999, + 0.999, 0.999, 0.999, 0.999, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 31.62177660168379, 999.999, + 999.999, 999.999, 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, + 0.999, 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999, + 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999, + 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999, + 999.999, 999.999, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03062277660168379, + 0.03062277660168379, 0.03062277660168379, 0.03062277660168379, + 0.03062277660168379, 0.999, 0.999, 0.999, 0.999, 0.999, + 31.62177660168379, 31.62177660168379, 31.62177660168379, + 31.62177660168379, 31.62177660168379, 999.999, 999.999, 999.999, + 999.999, 999.999; + + x << -0.1, 0.2, 0.5, 0.8, 1.1, 
-0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, + 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, + 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, + -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, + 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, + 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, + 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, 0.8, 1.1, -0.1, 0.2, 0.5, + 0.8, 1.1; + + v << nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, + nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, nan, + nan, nan, nan, 0.47972119876364683, 0.5, 0.5202788012363533, nan, nan, + 0.9518683957740043, 0.9789663010413743, 0.9931729188073435, nan, nan, + 0.999995949033062, 0.9999999999993698, 0.9999999999999999, nan, nan, + 0.9999999999999999, 0.9999999999999999, 0.9999999999999999, nan, nan, + nan, nan, nan, nan, nan, 0.006827081192655869, 0.0210336989586256, + 0.04813160422599567, nan, nan, 0.20014344256217678, 0.5000000000000001, + 0.7998565574378232, nan, nan, 0.9991401428435834, 0.999999999698403, + 0.9999999999999999, nan, nan, 0.9999999999999999, 0.9999999999999999, + 0.9999999999999999, nan, nan, nan, nan, nan, nan, nan, + 1.0646600232370887e-25, 6.301722877826246e-13, 4.050966937974938e-06, + nan, nan, 7.864342668429763e-23, 3.015969667594166e-10, + 0.0008598571564165444, nan, nan, 6.031987710123844e-08, + 0.5000000000000007, 0.9999999396801229, nan, nan, 0.9999999999999999, + 0.9999999999999999, 0.9999999999999999, nan, nan, nan, nan, nan, nan, + nan, 0.0, 7.029920380986636e-306, 2.2450728208591345e-101, nan, nan, + 0.0, 9.275871147869727e-302, 1.2232913026152827e-97, nan, nan, 0.0, + 3.0891393081932924e-252, 2.9303043666183996e-60, nan, nan, + 2.248913486879199e-196, 0.5000000000004947, 0.9999999999999999, nan; + + CALL_SUBTEST(res = betainc(a, b, x); + verify_component_wise(res, v);); + } + + // Test various properties of betainc + { + ArrayType m1 = ArrayType::Random(32); + ArrayType m2 = ArrayType::Random(32); + ArrayType m3 = ArrayType::Random(32); + ArrayType one = ArrayType::Constant(32, Scalar(1.0)); + const Scalar eps = std::numeric_limits<Scalar>::epsilon(); + ArrayType a = (m1 * 4.0).exp(); + ArrayType b = (m2 * 4.0).exp(); + ArrayType x = m3.abs(); + + // betainc(a, 1, x) == x**a + CALL_SUBTEST( + ArrayType test = betainc(a, one, x); + ArrayType expected = x.pow(a); + verify_component_wise(test, expected);); + + // betainc(1, b, x) == 1 - (1 - x)**b + CALL_SUBTEST( + ArrayType test = betainc(one, b, x); + ArrayType expected = one - (one - x).pow(b); + verify_component_wise(test, expected);); + + // betainc(a, b, x) == 1 - betainc(b, a, 1-x) + CALL_SUBTEST( + ArrayType test = betainc(a, b, x) + betainc(b, a, one - x); + ArrayType expected = one; + verify_component_wise(test, expected);); + + // betainc(a+1, b, x) = betainc(a, b, x) - x**a * (1 - x)**b / (a * beta(a, b)) + CALL_SUBTEST( + ArrayType num = x.pow(a) * (one - x).pow(b); + ArrayType denom = a * (a.lgamma() + b.lgamma() - (a + b).lgamma()).exp(); + // Add eps to rhs and lhs so that component-wise test doesn't result in + // nans when both outputs are zeros. 
+ ArrayType expected = betainc(a, b, x) - num / denom + eps; + ArrayType test = betainc(a + one, b, x) + eps; + if (sizeof(Scalar) >= 8) { // double + verify_component_wise(test, expected); + } else { + // Reason for limited test: http://eigen.tuxfamily.org/bz/show_bug.cgi?id=1232 + verify_component_wise(test.head(8), expected.head(8)); + }); + + // betainc(a, b+1, x) = betainc(a, b, x) + x**a * (1 - x)**b / (b * beta(a, b)) + CALL_SUBTEST( + // Add eps to rhs and lhs so that component-wise test doesn't result in + // nans when both outputs are zeros. + ArrayType num = x.pow(a) * (one - x).pow(b); + ArrayType denom = b * (a.lgamma() + b.lgamma() - (a + b).lgamma()).exp(); + ArrayType expected = betainc(a, b, x) + num / denom + eps; + ArrayType test = betainc(a, b + one, x) + eps; + verify_component_wise(test, expected);); + } +#endif +} + +void test_special_functions() +{ + CALL_SUBTEST_1(array_special_functions<ArrayXf>()); + CALL_SUBTEST_2(array_special_functions<ArrayXd>()); +}
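For reference, the identities the new special_functions test verifies, collected from its comments into standard notation: Gamma and gamma denote the unnormalized upper and lower incomplete gamma functions (igammac and igamma are the corresponding regularized ratios), I_x(a, b) is the regularized incomplete beta function computed by betainc(a, b, x), and B(a, b) = Gamma(a)Gamma(b)/Gamma(a+b):

    \begin{aligned}
    \Gamma(a,0) &= \Gamma(a) \\
    \Gamma(a,x) + \gamma(a,x) &= \Gamma(a) \\
    \Gamma(a,x) &= (a-1)\,\Gamma(a-1,x) + x^{a-1}e^{-x} \\
    \gamma(a,x) &= (a-1)\,\gamma(a-1,x) - x^{a-1}e^{-x} \\
    I_x(a,1) &= x^a \\
    I_x(1,b) &= 1-(1-x)^b \\
    I_x(a,b) &= 1 - I_{1-x}(b,a) \\
    I_x(a+1,b) &= I_x(a,b) - \frac{x^a(1-x)^b}{a\,B(a,b)} \\
    I_x(a,b+1) &= I_x(a,b) + \frac{x^a(1-x)^b}{b\,B(a,b)}
    \end{aligned}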