Merged with default.

author: Luke Iwanski <luke@codeplay.com> 2016-09-19 14:03:54 +0100
committer: Luke Iwanski <luke@codeplay.com> 2016-09-19 14:03:54 +0100
commit: b91e0211727b9ea5d7c30908ed86afc4e50d4c6c (patch)
tree: 4dd857d1aabdd4065e8b7dec1369a4bd06bac5e7 /unsupported
parent: cb81975714a96ecb2faf33ca242feeee3543b1db (diff)
parent: ff47717f25aeede4878f65b214cdce264b8314e8 (diff)
57 files changed, 1575 insertions, 339 deletions
diff --git a/unsupported/Eigen/CMakeLists.txt b/unsupported/Eigen/CMakeLists.txt
index 7478b6b0d..631a06014 100644
--- a/unsupported/Eigen/CMakeLists.txt
+++ b/unsupported/Eigen/CMakeLists.txt
@@ -4,6 +4,7 @@ set(Eigen_HEADERS
   ArpackSupport
   AutoDiff
   BVH
+  EulerAngles
   FFT
   IterativeSolvers 
   KroneckerProduct
@@ -26,5 +27,6 @@ install(FILES
   DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel
   )
 
-add_subdirectory(src)
-add_subdirectory(CXX11)
-\ No newline at end of file
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen COMPONENT Devel FILES_MATCHING PATTERN "*.h")
+
+add_subdirectory(CXX11)
diff --git a/unsupported/Eigen/CXX11/CMakeLists.txt b/unsupported/Eigen/CXX11/CMakeLists.txt
index a40bc4715..385ed240c 100644
--- a/unsupported/Eigen/CXX11/CMakeLists.txt
+++ b/unsupported/Eigen/CXX11/CMakeLists.txt
@@ -5,4 +5,4 @@ install(FILES
   DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel
   )
 
-add_subdirectory(src)
+install(DIRECTORY src DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11 COMPONENT Devel FILES_MATCHING PATTERN "*.h")
diff --git a/unsupported/Eigen/CXX11/src/CMakeLists.txt b/unsupported/Eigen/CXX11/src/CMakeLists.txt
deleted file mode 100644
index 1734262bb..000000000
--- a/unsupported/Eigen/CXX11/src/CMakeLists.txt
+++ /dev/null
@@ -1,4 +0,0 @@
-add_subdirectory(util)
-add_subdirectory(ThreadPool)
-add_subdirectory(Tensor)
-add_subdirectory(TensorSymmetry)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
deleted file mode 100644
index 6d4b3ea0d..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_Tensor_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_CXX11_Tensor_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Tensor COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index e3880d2e0..3c8710255 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -192,6 +192,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived>
+    log1p() const {
+      return unaryExpr(internal::scalar_log1p_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
     abs() const {
       return unaryExpr(internal::scalar_abs_op<Scalar>());
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 56d9c2025..20b29e5fd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -25,8 +25,9 @@ template<typename Dimensions, typename LhsXprType, typename RhsXprType>
 struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
 {
   // Type promotion to handle the case where the types of the lhs and the rhs are different.
-  typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
-                                                  typename RhsXprType::Scalar>::ret Scalar;
+  typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
+                               typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar;
+
   typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
                                         typename traits<RhsXprType>::StorageKind>::ret StorageKind;
   typedef typename promote_index_type<typename traits<LhsXprType>::Index,
@@ -75,8 +76,8 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
 {
   public:
   typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
-  typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
-                                                  typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+  typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
+                                                   typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index a76c8ca35..d66e45d50 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -91,21 +91,21 @@ class TensorOpCost {
   }
 
   // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
-      const TensorOpCost& rhs) {
-    bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
-    bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
-    compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
-    return *this;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
+      const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
   }
 
   // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
-      const TensorOpCost& rhs) {
-    bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
-    bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
-    compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
-    return *this;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
+      const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index b2b4bcf62..834ce07df 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -239,7 +239,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
-      : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device)
+      : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
   { }
 
   typedef typename XprType::Index Index;
@@ -256,13 +256,13 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
-    return m_functor(index);
+    return m_wrapper(m_functor, index);
   }
 
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    return m_functor.template packetOp<Index, PacketReturnType>(index);
+    return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -282,6 +282,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
  private:
   const NullaryOp m_functor;
   TensorEvaluator<ArgType, Device> m_argImpl;
+  const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
 };
 
 
@@ -612,7 +613,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
         .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; }
   /// required by sycl in order to extract the accessor
   const TensorEvaluator<IfArgType, Device> & cond_impl() const { return m_condImpl; }
   /// required by sycl in order to extract the accessor
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index a8e48fced..fc75dbb5c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -25,7 +25,7 @@ struct scalar_mod_op {
 };
 template <typename Scalar>
 struct functor_traits<scalar_mod_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
 
 
 /** \internal
@@ -38,7 +38,7 @@ struct scalar_mod2_op {
 };
 template <typename Scalar>
 struct functor_traits<scalar_mod2_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
 
 template <typename Scalar>
 struct scalar_fmod_op {
@@ -188,6 +188,32 @@ struct reducer_traits<MeanReducer<T>, Device> {
 };
 
 
+template <typename T, bool IsMax = true, bool IsInteger = true>
+struct MinMaxBottomValue {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+    return Eigen::NumTraits<T>::lowest();
+  }
+};
+template <typename T>
+struct MinMaxBottomValue<T, true, false> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+    return -Eigen::NumTraits<T>::infinity();
+  }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, true> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+    return Eigen::NumTraits<T>::highest();
+  }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, false> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+    return Eigen::NumTraits<T>::infinity();
+  }
+};
+
+
 template <typename T> struct MaxReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasMax;
@@ -200,9 +226,8 @@ template <typename T> struct MaxReducer
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
     (*accum) = pmax<Packet>(*accum, p);
   }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return Eigen::NumTraits<T>::lowest();
+    return MinMaxBottomValue<T, true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -242,9 +267,8 @@ template <typename T> struct MinReducer
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
     (*accum) = pmin<Packet>(*accum, p);
   }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return Eigen::NumTraits<T>::highest();
+    return MinMaxBottomValue<T, false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -454,12 +478,11 @@ template <typename T> class UniformRandomGenerator {
     m_deterministic = other.m_deterministic;
   }
 
-  template<typename Index>
-  T operator()(Index) const {
+  T operator()() const {
     return random<T>();
   }
-  template<typename Index, typename PacketType>
-  PacketType packetOp(Index) const {
+  template<typename PacketType>
+  PacketType packetOp() const {
     const int packetSize = internal::unpacket_traits<PacketType>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
     for (int i = 0; i < packetSize; ++i) {
@@ -484,23 +507,22 @@ template <> class UniformRandomGenerator<float> {
   }
   UniformRandomGenerator(const UniformRandomGenerator<float>& other) {
     m_generator = new std::mt19937();
-    m_generator->seed(other(0) * UINT_MAX);
+    m_generator->seed(other() * UINT_MAX);
     m_deterministic = other.m_deterministic;
   }
   ~UniformRandomGenerator() {
     delete m_generator;
   }
 
-  template<typename Index>
-  float operator()(Index) const {
+  float operator()() const {
     return m_distribution(*m_generator);
   }
-  template<typename Index, typename PacketType>
-  PacketType packetOp(Index i) const {
+  template<typename PacketType>
+  PacketType packetOp() const {
     const int packetSize = internal::unpacket_traits<PacketType>::size;
     EIGEN_ALIGN_MAX float values[packetSize];
     for (int k = 0; k < packetSize; ++k) {
-      values[k] = this->operator()(i);
+      values[k] = this->operator()();
     }
     return internal::pload<PacketType>(values);
   }
@@ -525,23 +547,22 @@ template <> class UniformRandomGenerator<double> {
   }
   UniformRandomGenerator(const UniformRandomGenerator<double>& other) {
     m_generator = new std::mt19937();
-    m_generator->seed(other(0) * UINT_MAX);
+    m_generator->seed(other() * UINT_MAX);
     m_deterministic = other.m_deterministic;
   }
   ~UniformRandomGenerator() {
     delete m_generator;
   }
 
-  template<typename Index>
-  double operator()(Index) const {
+  double operator()() const {
     return m_distribution(*m_generator);
   }
-  template<typename Index, typename PacketType>
-  PacketType packetOp(Index i) const {
+  template<typename PacketType>
+  PacketType packetOp() const {
     const int packetSize = internal::unpacket_traits<PacketType>::size;
     EIGEN_ALIGN_MAX double values[packetSize];
     for (int k = 0; k < packetSize; ++k) {
-      values[k] = this->operator()(i);
+      values[k] = this->operator()();
     }
     return internal::pload<PacketType>(values);
   }
@@ -578,12 +599,11 @@ template <> class UniformRandomGenerator<float> {
      curand_init(seed, tid, 0, &m_state);
   }
 
-  template<typename Index>
-  __device__ float operator()(Index) const {
+  __device__ float operator()() const {
     return curand_uniform(&m_state);
   }
-  template<typename Index, typename PacketType>
-  __device__ float4 packetOp(Index) const {
+  template<typename PacketType>
+  __device__ float4 packetOp() const {
     EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
     return curand_uniform4(&m_state);
   }
@@ -608,12 +628,11 @@ template <> class UniformRandomGenerator<double> {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ double operator()(Index) const {
+  __device__ double operator()() const {
     return curand_uniform_double(&m_state);
   }
-  template<typename Index, typename PacketType>
-  __device__ double2 packetOp(Index) const {
+  template<typename PacketType>
+  __device__ double2 packetOp() const {
     EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
     return curand_uniform2_double(&m_state);
   }
@@ -638,8 +657,7 @@ template <> class UniformRandomGenerator<std::complex<float> > {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ std::complex<float> operator()(Index) const {
+  __device__ std::complex<float> operator()() const {
     float4 vals = curand_uniform4(&m_state);
     return std::complex<float>(vals.x, vals.y);
   }
@@ -664,8 +682,7 @@ template <> class UniformRandomGenerator<std::complex<double> > {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ std::complex<double> operator()(Index) const {
+  __device__ std::complex<double> operator()() const {
     double2 vals = curand_uniform2_double(&m_state);
     return std::complex<double>(vals.x, vals.y);
   }
@@ -701,17 +718,16 @@ template <typename T> class NormalRandomGenerator {
   }
   NormalRandomGenerator(const NormalRandomGenerator& other)
       : m_deterministic(other.m_deterministic), m_distribution(other.m_distribution), m_generator(new std::mt19937()) {
-    m_generator->seed(other(0) * UINT_MAX);
+    m_generator->seed(other() * UINT_MAX);
   }
   ~NormalRandomGenerator() {
     delete m_generator;
   }
-  template<typename Index>
-  T operator()(Index) const {
+  T operator()() const {
     return m_distribution(*m_generator);
   }
-  template<typename Index, typename PacketType>
-  PacketType packetOp(Index) const {
+  template<typename PacketType>
+  PacketType packetOp() const {
     const int packetSize = internal::unpacket_traits<PacketType>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
     for (int i = 0; i < packetSize; ++i) {
@@ -749,12 +765,11 @@ template <> class NormalRandomGenerator<float> {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ float operator()(Index) const {
+  __device__ float operator()() const {
     return curand_normal(&m_state);
   }
-  template<typename Index, typename PacketType>
-   __device__ float4 packetOp(Index) const {
+  template<typename PacketType>
+   __device__ float4 packetOp() const {
     EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
     return curand_normal4(&m_state);
   }
@@ -779,12 +794,11 @@ template <> class NormalRandomGenerator<double> {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ double operator()(Index) const {
+  __device__ double operator()() const {
     return curand_normal_double(&m_state);
   }
-  template<typename Index, typename PacketType>
-  __device__ double2 packetOp(Index) const {
+  template<typename PacketType>
+  __device__ double2 packetOp() const {
     EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
     return curand_normal2_double(&m_state);
   }
@@ -809,8 +823,7 @@ template <> class NormalRandomGenerator<std::complex<float> > {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ std::complex<float> operator()(Index) const {
+  __device__ std::complex<float> operator()() const {
     float4 vals = curand_normal4(&m_state);
     return std::complex<float>(vals.x, vals.y);
   }
@@ -835,8 +848,7 @@ template <> class NormalRandomGenerator<std::complex<double> > {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ std::complex<double> operator()(Index) const {
+  __device__ std::complex<double> operator()() const {
     double2 vals = curand_normal2_double(&m_state);
     return std::complex<double>(vals.x, vals.y);
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 9df697e4c..a87777b22 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -505,9 +505,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
           (reducing_inner_dims || ReducingInnerMostDims)) {
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
-        if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
-          data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
-          m_result = data;
+        if (!data) {
+          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
+            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+            m_result = data;
+          }
+          else {
+            return true;
+          }
         }
         Op reducer(m_reducer);
         if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
@@ -533,9 +538,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
           preserving_inner_dims) {
         const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
         const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
-        if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
-          data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
-          m_result = data;
+        if (!data) {
+          if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
+            data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+            m_result = data;
+          }
+          else {
+            return true;
+          }
         }
         Op reducer(m_reducer);
         if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
@@ -556,6 +566,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
     m_impl.cleanup();
     if (m_result) {
       m_device.deallocate(m_result);
+      m_result = NULL;
     }
   }
 
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 5e512490c..65638b6a8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -67,11 +67,21 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
 #endif
 }
 
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+  return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
 
 #ifdef EIGEN_HAS_CUDA_FP16
 template <template <typename T> class R>
 __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
-#if __CUDA_ARCH__ >= 300
   unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
   unsigned int newval = oldval;
   reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
@@ -87,9 +97,6 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
       return;
     }
   }
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
 }
 #endif
 
@@ -130,7 +137,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
       unsigned int block = atomicCAS(semaphore, 0u, 1u);
       if (block == 0) {
         // We're the first block to run, initialize the output value
-        atomicExch(output, reducer.initialize());
+        atomicExchCustom(output, reducer.initialize());
         __threadfence();
         atomicExch(semaphore, 2u);
       }
@@ -263,17 +270,22 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
 
 #endif
 
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct FullReductionLauncher {
   static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
-    assert(false && "Should only be called on floats and half floats");
+    assert(false && "Should only be called on doubles, floats and half floats");
   }
 };
 
-template <typename Self, typename Op, bool PacketAccess>
-struct FullReductionLauncher<Self, Op, float, PacketAccess> {
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+    Self, Op, OutputType, PacketAccess,
+    typename internal::enable_if<
+      internal::is_same<float, OutputType>::value ||
+      internal::is_same<double, OutputType>::value,
+    void>::type> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
     typedef typename Self::Index Index;
     typedef typename Self::CoeffReturnType Scalar;
     const int block_size = 256;
@@ -330,20 +342,22 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
   // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple case
-  // of floats and half floats.
+  // so reduce the scope of the optimized version of the code to the simple cases
+  // of doubles, floats and half floats
 #ifdef EIGEN_HAS_CUDA_FP16
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
        (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
 #else
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
-                                                 internal::is_same<typename Self::CoeffReturnType, float>::value;
+                                                (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                 internal::is_same<typename Self::CoeffReturnType, double>::value);
 #endif
 
   template <typename OutputType>
   static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
-    assert(HasOptimizedImplementation && "Should only be called on floats or half floats");
+    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
     const Index num_coeffs = array_prod(self.m_impl.dimensions());
     // Don't crash when we're called with an input tensor of size 0.
     if (num_coeffs == 0) {
@@ -360,6 +374,7 @@ template <int NumPerThread, typename Self,
 __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                          typename Self::CoeffReturnType* output) {
 #if __CUDA_ARCH__ >= 300
+  typedef typename Self::CoeffReturnType Type;
   eigen_assert(blockDim.y == 1);
   eigen_assert(blockDim.z == 1);
   eigen_assert(gridDim.y == 1);
@@ -389,13 +404,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
       const Index col_block = i % input_col_blocks;
       const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
 
-      float reduced_val = reducer.initialize();
+      Type reduced_val = reducer.initialize();
 
       for (Index j = 0; j < NumPerThread; j += unroll_times) {
         const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
         if (last_col >= num_coeffs_to_reduce) {
           for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
-            const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+            const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
             reducer.reduce(val, &reduced_val);
           }
           break;
@@ -521,17 +536,23 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
 
 #endif
 
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct InnerReductionLauncher {
   static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce floats and half floats on a gpu device");
+    assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
     return true;
   }
 };
 
-template <typename Self, typename Op, bool PacketAccess>
-struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+  Self, Op, OutputType, PacketAccess,
+  typename internal::enable_if<
+    internal::is_same<float, OutputType>::value ||
+    internal::is_same<double, OutputType>::value,
+  void>::type> {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
     typedef typename Self::Index Index;
 
     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
@@ -549,7 +570,7 @@ struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
       const int max_blocks = device.getNumCudaMultiProcessors() *
                            device.maxCudaThreadsPerMultiProcessor() / 1024;
       const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
+      LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
                          num_blocks, 1024, 0, device, reducer.initialize(),
                          num_preserved_vals, output);
     }
@@ -616,15 +637,17 @@ struct InnerReducer<Self, Op, GpuDevice> {
 #ifdef EIGEN_HAS_CUDA_FP16
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
        (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
 #else
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
-                                                 internal::is_same<typename Self::CoeffReturnType, float>::value;
+                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
 #endif
 
   template <typename OutputType>
   static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    assert(HasOptimizedImplementation && "Should only be called on floats or half floats");
+    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
     const Index num_coeffs = array_prod(self.m_impl.dimensions());
     // Don't crash when we're called with an input tensor of size 0.
     if (num_coeffs == 0) {
@@ -675,11 +698,11 @@ struct OuterReducer<Self, Op, GpuDevice> {
   // so reduce the scope of the optimized version of the code to the simple case
   // of floats.
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
-                                                 internal::is_same<typename Self::CoeffReturnType, float>::value;
-
+                                                 (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+                                                  internal::is_same<typename Self::CoeffReturnType, double>::value);
   template <typename Device, typename OutputType>
   static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce floats on a gpu device");
+    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
     return true;
   }
 
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt
deleted file mode 100644
index 6e871a8da..000000000
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/CMakeLists.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-FILE(GLOB Eigen_CXX11_TensorSymmetry_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_CXX11_TensorSymmetry_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry COMPONENT Devel
-  )
-
-add_subdirectory(util)
diff --git a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt
deleted file mode 100644
index dc9fc78ec..000000000
--- a/unsupported/Eigen/CXX11/src/TensorSymmetry/util/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_TensorSymmetry_util_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_CXX11_TensorSymmetry_util_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/TensorSymmetry/util COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt b/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
deleted file mode 100644
index 88fef50c6..000000000
--- a/unsupported/Eigen/CXX11/src/ThreadPool/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_ThreadPool_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_CXX11_ThreadPool_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/ThreadPool COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
index 12b80d6c4..71d55552d 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/EventCount.h
@@ -50,7 +50,7 @@ class EventCount {
  public:
   class Waiter;
 
-  EventCount(std::vector<Waiter>& waiters) : waiters_(waiters) {
+  EventCount(MaxSizeVector<Waiter>& waiters) : waiters_(waiters) {
     eigen_assert(waiters.size() < (1 << kWaiterBits) - 1);
     // Initialize epoch to something close to overflow to test overflow.
     state_ = kStackMask | (kEpochMask - kEpochInc * waiters.size() * 2);
@@ -199,7 +199,7 @@ class EventCount {
   static const uint64_t kEpochMask = ((1ull << kEpochBits) - 1) << kEpochShift;
   static const uint64_t kEpochInc = 1ull << kEpochShift;
   std::atomic<uint64_t> state_;
-  std::vector<Waiter>& waiters_;
+  MaxSizeVector<Waiter>& waiters_;
 
   void Park(Waiter* w) {
     std::unique_lock<std::mutex> lock(w->mu);
diff --git a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
index 33ae45131..354bce52a 100644
--- a/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/ThreadPool/NonBlockingThreadPool.h
@@ -29,6 +29,8 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
         spinning_(0),
         done_(false),
         ec_(waiters_) {
+    waiters_.resize(num_threads);
+
     // Calculate coprimes of num_threads.
     // Coprimes are used for a random walk over all threads in Steal
     // and NonEmptyQueueIndex. Iteration is based on the fact that if we take
@@ -123,7 +125,7 @@ class NonBlockingThreadPoolTempl : public Eigen::ThreadPoolInterface {
   MaxSizeVector<Thread*> threads_;
   MaxSizeVector<Queue*> queues_;
   MaxSizeVector<unsigned> coprimes_;
-  std::vector<EventCount::Waiter> waiters_;
+  MaxSizeVector<EventCount::Waiter> waiters_;
   std::atomic<unsigned> blocked_;
   std::atomic<bool> spinning_;
   std::atomic<bool> done_;
diff --git a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt b/unsupported/Eigen/CXX11/src/util/CMakeLists.txt
deleted file mode 100644
index 7eab492d6..000000000
--- a/unsupported/Eigen/CXX11/src/util/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_util_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_CXX11_util_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/util COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
index 961456f10..4bc3dd1ba 100644
--- a/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
+++ b/unsupported/Eigen/CXX11/src/util/MaxSizeVector.h
@@ -55,6 +55,17 @@ class MaxSizeVector {
     internal::aligned_free(data_);
   }
 
+  // Grows (default-constructing the new tail) or shrinks (destroying the surplus) to exactly n elements; n must not exceed reserve_.
+  void resize(size_t n) {
+    eigen_assert(n <= reserve_);
+    for (; size_ < n; ++size_) {  // size_ advances per element, so it still counts the live elements if a constructor throws
+      new (&data_[size_]) T;
+    }
+    for (; size_ > n; --size_) {  // destroy surplus elements back to front
+      data_[size_ - 1].~T();
+    }
+  }
+
   // Append new elements (up to reserved size).
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE
   void push_back(const T& t) {
diff --git a/unsupported/Eigen/EulerAngles b/unsupported/Eigen/EulerAngles
new file mode 100644
index 000000000..521fa3f76
--- /dev/null
+++ b/unsupported/Eigen/EulerAngles
@@ -0,0 +1,43 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLES_MODULE_H
+#define EIGEN_EULERANGLES_MODULE_H
+
+
+#include "Eigen/Core"
+#include "Eigen/Geometry"
+
+#include "Eigen/src/Core/util/DisableStupidWarnings.h"
+
+namespace Eigen {
+
+/**
+  * \defgroup EulerAngles_Module EulerAngles module
+  * \brief This module provides generic Euler angles rotations.
+  *
+  * Euler angles are a way to represent 3D rotation.
+  *
+  * In order to use this module in your code, include this header:
+  * \code
+  * #include <unsupported/Eigen/EulerAngles>
+  * \endcode
+  *
+  * See \ref EulerAngles for more information.
+  *
+  */
+
+}
+
+#include "src/EulerAngles/EulerSystem.h"
+#include "src/EulerAngles/EulerAngles.h"
+
+#include "Eigen/src/Core/util/ReenableStupidWarnings.h"
+
+#endif // EIGEN_EULERANGLES_MODULE_H
diff --git a/unsupported/Eigen/KroneckerProduct b/unsupported/Eigen/KroneckerProduct
index c932c06a6..5f5afb8cf 100644
--- a/unsupported/Eigen/KroneckerProduct
+++ b/unsupported/Eigen/KroneckerProduct
@@ -13,6 +13,8 @@
 
 #include "../../Eigen/src/Core/util/DisableStupidWarnings.h"
 
+#include "../../Eigen/src/SparseCore/SparseUtil.h"
+
 namespace Eigen {
 
 /**
diff --git a/unsupported/Eigen/src/AutoDiff/CMakeLists.txt b/unsupported/Eigen/src/AutoDiff/CMakeLists.txt
deleted file mode 100644
index ad91fd9c4..000000000
--- a/unsupported/Eigen/src/AutoDiff/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_AutoDiff_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_AutoDiff_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/AutoDiff COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/BVH/CMakeLists.txt b/unsupported/Eigen/src/BVH/CMakeLists.txt
deleted file mode 100644
index b377d865c..000000000
--- a/unsupported/Eigen/src/BVH/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_BVH_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_BVH_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/BVH COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/CMakeLists.txt b/unsupported/Eigen/src/CMakeLists.txt
deleted file mode 100644
index f42946793..000000000
--- a/unsupported/Eigen/src/CMakeLists.txt
+++ /dev/null
@@ -1,16 +0,0 @@
-ADD_SUBDIRECTORY(AutoDiff)
-ADD_SUBDIRECTORY(BVH)
-ADD_SUBDIRECTORY(Eigenvalues)
-ADD_SUBDIRECTORY(FFT)
-ADD_SUBDIRECTORY(IterativeSolvers)
-ADD_SUBDIRECTORY(LevenbergMarquardt)
-ADD_SUBDIRECTORY(MatrixFunctions)
-ADD_SUBDIRECTORY(MoreVectorization)
-ADD_SUBDIRECTORY(NonLinearOptimization)
-ADD_SUBDIRECTORY(NumericalDiff)
-ADD_SUBDIRECTORY(Polynomials)
-ADD_SUBDIRECTORY(Skyline)
-ADD_SUBDIRECTORY(SparseExtra)
-ADD_SUBDIRECTORY(SpecialFunctions)
-ADD_SUBDIRECTORY(KroneckerProduct)
-ADD_SUBDIRECTORY(Splines)
diff --git a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
index 3b6a69aff..866a8a460 100644
--- a/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
+++ b/unsupported/Eigen/src/Eigenvalues/ArpackSelfAdjointEigenSolver.h
@@ -628,15 +628,15 @@ ArpackGeneralizedSelfAdjointEigenSolver<MatrixType, MatrixSolver, BisSPD>&
       m_info = Success;
     }
 
-    delete select;
+    delete[] select;
   }
 
-  delete v;
-  delete iparam;
-  delete ipntr;
-  delete workd;
-  delete workl;
-  delete resid;
+  delete[] v;
+  delete[] iparam;
+  delete[] ipntr;
+  delete[] workd;
+  delete[] workl;
+  delete[] resid;
 
   m_isInitialized = true;
 
diff --git a/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt b/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt
deleted file mode 100644
index 1d4387c82..000000000
--- a/unsupported/Eigen/src/Eigenvalues/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Eigenvalues_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Eigenvalues_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Eigenvalues COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/EulerAngles/CMakeLists.txt b/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
new file mode 100644
index 000000000..40af550e8
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/CMakeLists.txt
@@ -0,0 +1,6 @@
+file(GLOB eigen_euler_angles_headers "*.h")  # every header located directly in this directory
+
+install(
+  FILES ${eigen_euler_angles_headers}
+  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/EulerAngles
+  COMPONENT Devel)
diff --git a/unsupported/Eigen/src/EulerAngles/EulerAngles.h b/unsupported/Eigen/src/EulerAngles/EulerAngles.h
new file mode 100644
index 000000000..13a0da1ab
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/EulerAngles.h
@@ -0,0 +1,386 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERANGLESCLASS_H// TODO: Fix previous "EIGEN_EULERANGLES_H" definition?
+#define EIGEN_EULERANGLESCLASS_H
+
+namespace Eigen
+{
+  /*template<typename Other,
+         int OtherRows=Other::RowsAtCompileTime,
+         int OtherCols=Other::ColsAtCompileTime>
+  struct ei_eulerangles_assign_impl;*/
+
+  /** \class EulerAngles
+    *
+    * \ingroup EulerAngles_Module
+    *
+    * \brief Represents a rotation in a 3 dimensional space as three Euler angles.
+    *
+    * An Euler rotation is a sequence of three rotations by three angles about three fixed axes, defined by the EulerSystem given as a template parameter.
+    * 
+    * Here is how intrinsic Euler angles work:
+    *  - first, rotate the axes system about the alpha axis by the angle alpha
+    *  - then, rotate the axes system about the beta axis (which was rotated in the first stage) by the angle beta
+    *  - then, rotate the axes system about the gamma axis (which was rotated in the two stages above) by the angle gamma
+    *
+    * \note This class supports only intrinsic Euler angles for simplicity;
+    *  see EulerSystem for how to easily overcome this for extrinsic systems.
+    *
+    * ### Rotation representation and conversions ###
+    *
+    * It has been proved (see Wikipedia link below) that every rotation can be represented
+    *  by Euler angles, but the representation is not unique (unlike e.g. rotation matrices).
+    * Therefore, you can convert from any Eigen rotation to Euler angles and back
+    *  (including rotation matrices, which are not called "rotations" by Eigen design).
+    *
+    * Euler angles are usually used for:
+    *  - convenient human representation of rotation, especially in interactive GUI.
+    *  - gimbal systems and robotics
+    *  - efficient encoding(i.e. 3 floats only) of rotation for network protocols.
+    *
+    * However, Euler angles are slow compared to quaternions or matrices,
+    *  because of their unnatural math definition, although they are simple for humans.
+    * To overcome this, this class provides easy conversion from the math friendly representation
+    *  to the human friendly representation, and vice versa.
+    *
+    * All the user needs to do is a safe, simple C++ type conversion,
+    *  and this class takes care of the math.
+    * Additionally, some axes related computation is done at compile time.
+    *
+    * #### Euler angles ranges in conversions ####
+    *
+    * When converting some rotation to Euler angles, there are several ways to guarantee
+    *  the ranges of the resulting Euler angles.
+    *
+    * #### implicit ranges ####
+    * When using implicit ranges, all angles are guaranteed to be in the range [-PI, +PI],
+    *  unless you convert from some other Euler angles.
+    * In this case, the range is __undefined__ (might be even less than -PI or greater than +2*PI).
+    * \sa EulerAngles(const MatrixBase<Derived>&)
+    * \sa EulerAngles(const RotationBase<Derived, 3>&)
+    *
+    * #### explicit ranges ####
+    * When using explicit ranges, all angles are guaranteed to be in the range you choose.
+    * With each Boolean range parameter, you are asked whether you prefer the positive range or not:
+    * - _true_ - force the range between [0, +2*PI]
+    * - _false_ - force the range between [-PI, +PI]
+    *
+    * ##### compile time ranges #####
+    * This is when the ranges are known at compile time and you prefer to
+    *  use template parameters (e.g. for performance).
+    * \sa FromRotation()
+    *
+    * ##### run-time ranges #####
+    * Run-time ranges are also supported.
+    * \sa EulerAngles(const MatrixBase<Derived>&, bool, bool, bool)
+    * \sa EulerAngles(const RotationBase<Derived, 3>&, bool, bool, bool)
+    *
+    * ### Convenient user typedefs ###
+    *
+    * Convenient typedefs for EulerAngles exist for float and double scalar,
+    *  in a form of EulerAngles{A}{B}{C}{scalar},
+    *  e.g. \ref EulerAnglesXYZd, \ref EulerAnglesZYZf.
+    *
+    * Convenient typedefs exist only for Euler systems with positive axes {+x,+y,+z}.
+    * If you need negative axes {-x,-y,-z}, it is recommended to create your own typedef with
+    *  a name that represents what you need.
+    *
+    * ### Example ###
+    *
+    * \include EulerAngles.cpp
+    * Output: \verbinclude EulerAngles.out
+    *
+    * ### Additional reading ###
+    *
+    * If you want to learn more about how Euler systems work in Eigen, see EulerSystem.
+    *
+    * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
+    *
+    * \tparam _Scalar the scalar type, i.e., the type of the angles.
+    *
+    * \tparam _System the EulerSystem to use, which represents the axes of rotation.
+    */
+  template <typename _Scalar, class _System>
+  class EulerAngles : public RotationBase<EulerAngles<_Scalar, _System>, 3>
+  {
+    public:
+      /** the scalar type of the angles */
+      typedef _Scalar Scalar;
+      
+      /** the EulerSystem to use, which represents the axes of rotation. */
+      typedef _System System;
+    
+      typedef Matrix<Scalar,3,3> Matrix3; /*!< the equivalent rotation matrix type */
+      typedef Matrix<Scalar,3,1> Vector3; /*!< the equivalent 3 dimension vector type */
+      typedef Quaternion<Scalar> QuaternionType; /*!< the equivalent quaternion type */
+      typedef AngleAxis<Scalar> AngleAxisType; /*!< the equivalent angle-axis type */
+      
+      /** \returns the unit axis vector of the first (alpha) rotation, negated when System::IsAlphaOpposite is set */
+      static Vector3 AlphaAxisVector() {
+        const Vector3& u = Vector3::Unit(System::AlphaAxisAbs - 1); // -1: axis ids look 1-based (EULER_X == 1), Unit() index is 0-based - TODO confirm
+        return System::IsAlphaOpposite ? -u : u;
+      }
+      
+      /** \returns the unit axis vector of the second (beta) rotation, negated when System::IsBetaOpposite is set */
+      static Vector3 BetaAxisVector() {
+        const Vector3& u = Vector3::Unit(System::BetaAxisAbs - 1);
+        return System::IsBetaOpposite ? -u : u;
+      }
+      
+      /** \returns the unit axis vector of the third (gamma) rotation, negated when System::IsGammaOpposite is set */
+      static Vector3 GammaAxisVector() {
+        const Vector3& u = Vector3::Unit(System::GammaAxisAbs - 1);
+        return System::IsGammaOpposite ? -u : u;
+      }
+
+    private:
+      Vector3 m_angles;
+
+    public:
+      /** Default constructor without initialization. */
+      EulerAngles() {}
+      /** Constructs and initializes Euler angles (\p alpha, \p beta, \p gamma). */
+      EulerAngles(const Scalar& alpha, const Scalar& beta, const Scalar& gamma) :
+        m_angles(alpha, beta, gamma) {}
+      
+      /** Constructs and initializes Euler angles from a 3x3 rotation matrix \p m.
+        *
+        * \note All angles will be in the range [-PI, PI].
+      */
+      template<typename Derived>
+      EulerAngles(const MatrixBase<Derived>& m) { *this = m; }
+      
+      /** Constructs and initializes Euler angles from a 3x3 rotation matrix \p m,
+        *  with options to choose for each angle the requested range.
+        *
+        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+        * Otherwise, the specified angle will be in the range [-PI, +PI].
+        *
+        * \param m The 3x3 rotation matrix to convert
+        * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+      */
+      template<typename Derived>
+      EulerAngles(
+        const MatrixBase<Derived>& m,
+        bool positiveRangeAlpha,
+        bool positiveRangeBeta,
+        bool positiveRangeGamma) {
+        
+        System::CalcEulerAngles(*this, m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma);
+      }
+      
+      /** Constructs and initializes Euler angles from a rotation \p rot.
+        *
+        * \note All angles will be in the range [-PI, PI], unless \p rot is an EulerAngles.
+        *  If rot is an EulerAngles, expected EulerAngles range is __undefined__.
+        *  (Use other functions here for enforcing range if this effect is desired)
+      */
+      template<typename Derived>
+      EulerAngles(const RotationBase<Derived, 3>& rot) { *this = rot; }
+      
+      /** Constructs and initializes Euler angles from a rotation \p rot,
+        *  with options to choose for each angle the requested range.
+        *
+        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+        * Otherwise, the specified angle will be in the range [-PI, +PI].
+        *
+        * \param rot The rotation to convert (it is converted through rot.toRotationMatrix())
+        * \param positiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \param positiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \param positiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+      */
+      template<typename Derived>
+      EulerAngles(
+        const RotationBase<Derived, 3>& rot,
+        bool positiveRangeAlpha,
+        bool positiveRangeBeta,
+        bool positiveRangeGamma) {
+        
+        System::CalcEulerAngles(*this, rot.toRotationMatrix(), positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma);
+      }
+
+      /** \returns The angle values stored in a vector (alpha, beta, gamma). */
+      const Vector3& angles() const { return m_angles; }
+      /** \returns A read-write reference to the angle values stored in a vector (alpha, beta, gamma). */
+      Vector3& angles() { return m_angles; }
+
+      /** \returns The value of the first angle. */
+      Scalar alpha() const { return m_angles[0]; }
+      /** \returns A read-write reference to the first angle. */
+      Scalar& alpha() { return m_angles[0]; }
+
+      /** \returns The value of the second angle. */
+      Scalar beta() const { return m_angles[1]; }
+      /** \returns A read-write reference to the second angle. */
+      Scalar& beta() { return m_angles[1]; }
+
+      /** \returns The value of the third angle. */
+      Scalar gamma() const { return m_angles[2]; }
+      /** \returns A read-write reference to the third angle. */
+      Scalar& gamma() { return m_angles[2]; }
+
+      /** \returns The inverse rotation expressed in this Euler system: the transposed rotation matrix
+        *  converted back to angles, which therefore follow the implicit [-PI, PI] range of the matrix constructor.
+      */
+      EulerAngles inverse() const
+      {
+        // A rotation matrix is orthogonal, so its inverse is its transpose; merely negating the angles would not reverse the axis order.
+        const Matrix3 inverseRotation = toRotationMatrix().transpose();
+        return EulerAngles(inverseRotation);
+      }
+
+      /** \returns The component-wise negated angles (-alpha, -beta, -gamma).
+        * \note In general this is not the inverse rotation, because the axis order is kept; use inverse() for that.
+      */
+      EulerAngles operator -() const
+      {
+        return EulerAngles(-alpha(), -beta(), -gamma());
+      }
+      
+      /** Constructs and initializes Euler angles from a 3x3 rotation matrix \p m,
+        *  with options to choose for each angle the requested range (__only in compile time__).
+        *
+        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+        * Otherwise, the specified angle will be in the range [-PI, +PI].
+        *
+        * \param m The 3x3 rotation matrix to convert
+        * \tparam PositiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \tparam PositiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \tparam PositiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        */
+      template<
+        bool PositiveRangeAlpha,
+        bool PositiveRangeBeta,
+        bool PositiveRangeGamma,
+        typename Derived>
+      static EulerAngles FromRotation(const MatrixBase<Derived>& m)
+      {
+        EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
+        
+        EulerAngles e;
+        System::template CalcEulerAngles<
+          PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma, _Scalar>(e, m);
+        return e;
+      }
+      
+      /** Constructs and initializes Euler angles from a rotation \p rot,
+        *  with options to choose for each angle the requested range (__only in compile time__).
+        *
+        * If positive range is true, then the specified angle will be in the range [0, +2*PI].
+        * Otherwise, the specified angle will be in the range [-PI, +PI].
+        *
+        * \param rot The rotation to convert (it is converted through rot.toRotationMatrix())
+        * \tparam PositiveRangeAlpha If true, alpha will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \tparam PositiveRangeBeta If true, beta will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+        * \tparam PositiveRangeGamma If true, gamma will be in [0, 2*PI]. Otherwise, in [-PI, +PI].
+      */
+      template<
+        bool PositiveRangeAlpha,
+        bool PositiveRangeBeta,
+        bool PositiveRangeGamma,
+        typename Derived>
+      static EulerAngles FromRotation(const RotationBase<Derived, 3>& rot)
+      {
+        return FromRotation<PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma>(rot.toRotationMatrix());
+      }
+      
+      /*EulerAngles& fromQuaternion(const QuaternionType& q)
+      {
+        // TODO: Implement it in a faster way for quaternions
+        // According to http://www.euclideanspace.com/maths/geometry/rotations/conversions/quaternionToEuler/
+        //  we can compute only the needed matrix cells and then convert to euler angles. (see ZYX example below)
+        // Currently we compute all matrix cells from quaternion.
+
+        // Special case only for ZYX
+        //Scalar y2 = q.y() * q.y();
+        //m_angles[0] = std::atan2(2*(q.w()*q.z() + q.x()*q.y()), (1 - 2*(y2 + q.z()*q.z())));
+        //m_angles[1] = std::asin( 2*(q.w()*q.y() - q.z()*q.x()));
+        //m_angles[2] = std::atan2(2*(q.w()*q.x() + q.y()*q.z()), (1 - 2*(q.x()*q.x() + y2)));
+      }*/
+      
+      /** Set \c *this from a rotation matrix (i.e. pure orthogonal matrix with determinant of +1). */
+      template<typename Derived>
+      EulerAngles& operator=(const MatrixBase<Derived>& m) {
+        EIGEN_STATIC_ASSERT_MATRIX_SPECIFIC_SIZE(Derived, 3, 3)
+        
+        System::CalcEulerAngles(*this, m);
+        return *this;
+      }
+
+      // TODO: Assign and construct from another EulerAngles (with different system)
+      
+      /** Set \c *this from a rotation, going through its rotation matrix. */
+      template<typename Derived>
+      EulerAngles& operator=(const RotationBase<Derived, 3>& rot) {
+        System::CalcEulerAngles(*this, rot.toRotationMatrix());
+        return *this;
+      }
+      
+      // TODO: Support isApprox function
+
+      /** \returns an equivalent 3x3 rotation matrix (computed through the quaternion conversion). */
+      Matrix3 toRotationMatrix() const
+      {
+        return static_cast<QuaternionType>(*this).toRotationMatrix();
+      }
+
+      /** Convert the Euler angles to a quaternion: the product of the alpha, beta and gamma angle-axis rotations, in that order. */
+      operator QuaternionType() const
+      {
+        return
+          AngleAxisType(alpha(), AlphaAxisVector()) *
+          AngleAxisType(beta(), BetaAxisVector())   *
+          AngleAxisType(gamma(), GammaAxisVector());
+      }
+      
+      friend std::ostream& operator<<(std::ostream& s, const EulerAngles<Scalar, System>& eulerAngles)
+      {
+        s << eulerAngles.angles().transpose(); // the (alpha, beta, gamma) vector written as a row
+        return s;
+      }
+  };
+
+#define EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(AXES, SCALAR_TYPE, SCALAR_POSTFIX) /* one typedef: EulerAngles + AXES + SCALAR_POSTFIX */ \
+  /** \ingroup EulerAngles_Module */ \
+  typedef EulerAngles<SCALAR_TYPE, EulerSystem##AXES> EulerAngles##AXES##SCALAR_POSTFIX;
+
+#define EIGEN_EULER_ANGLES_TYPEDEFS(SCALAR_TYPE, SCALAR_POSTFIX) /* all twelve axis triples; first those starting with X */ \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XYX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(XZX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  /* axis triples starting with Y */ \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YZY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(YXY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  /* axis triples starting with Z */ \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXY, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZXZ, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYX, SCALAR_TYPE, SCALAR_POSTFIX) \
+  EIGEN_EULER_ANGLES_SINGLE_TYPEDEF(ZYZ, SCALAR_TYPE, SCALAR_POSTFIX)
+
+EIGEN_EULER_ANGLES_TYPEDEFS(float, f)  // EulerAnglesXYZf ... EulerAnglesZYZf
+EIGEN_EULER_ANGLES_TYPEDEFS(double, d) // EulerAnglesXYZd ... EulerAnglesZYZd
+
+  namespace internal {
+    // traits specialization publishing the angle scalar type of an EulerAngles instantiation.
+    template<typename ScalarT, class SystemT>
+    struct traits<EulerAngles<ScalarT, SystemT> >
+    {
+      typedef ScalarT Scalar;
+    };
+  }
+  
+}
+
+#endif // EIGEN_EULERANGLESCLASS_H
diff --git a/unsupported/Eigen/src/EulerAngles/EulerSystem.h b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
new file mode 100644
index 000000000..82243e643
--- /dev/null
+++ b/unsupported/Eigen/src/EulerAngles/EulerSystem.h
@@ -0,0 +1,316 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#ifndef EIGEN_EULERSYSTEM_H
+#define EIGEN_EULERSYSTEM_H
+
+namespace Eigen
+{
+  // Forward declarations
+  template <typename _Scalar, class _System>
+  class EulerAngles;
+  
+  namespace internal
+  {
+    // TODO: Check whether an equivalent helper already exists elsewhere in the API
+    template <int Num, bool IsPositive = (Num > 0)>
+    struct Abs
+    {
+      enum { value = Num }; // primary template, used when IsPositive is true (by default: Num > 0)
+    };
+  
+    template <int Num>
+    struct Abs<Num, false>
+    {
+      enum { value = -Num }; // IsPositive is false: negate Num (zero stays zero)
+    };
+
+    template <int Axis>
+    struct IsValidAxis
+    {
+      enum { value = Axis != 0 && Abs<Axis>::value <= 3 }; // non-zero only for Axis in {-3,-2,-1,1,2,3}
+    };
+  }
+  
+  #define EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(COND,MSG) typedef char static_assertion_##MSG[(COND)?1:-1]
+  
+  /** \brief Representation of a fixed signed rotation axis for EulerSystem.
+    *
+    * \ingroup EulerAngles_Module
+    *
+    * Values here represent:
+    *  - The axis of the rotation: X, Y or Z.
+    *  - The sign (i.e. direction of the rotation along the axis): positive(+) or negative(-)
+    *
+    * Therefore, this could express all the axes {+X,+Y,+Z,-X,-Y,-Z}
+    *
+    * For positive axis, use +EULER_{axis}, and for negative axis use -EULER_{axis}.
+    */
+  enum EulerAxis
+  {
+    EULER_X = 1, /*!< the X axis */
+    EULER_Y = 2, /*!< the Y axis */
+    EULER_Z = 3  /*!< the Z axis */
+  };
+  
+  /** \class EulerSystem
+    *
+    * \ingroup EulerAngles_Module
+    *
+    * \brief Represents a fixed Euler rotation system.
+    *
+    * The goal of this meta-class is to represent the Euler system at compile time, for EulerAngles.
+    *
+    * You can use this class to get two things:
+    *  - Build an Euler system, and then pass it as a template parameter to EulerAngles.
+    *  - Query some compile time data about an Euler system. (e.g. Whether it's tait bryan)
+    *
+    * Euler rotation is a set of three rotations on fixed axes. (see \ref EulerAngles)
+    * This meta-class stores those signed axes as compile-time constants. (see \ref EulerAxis)
+    *
+    * ### Types of Euler systems ###
+    *
+    * All and only valid 3 dimension Euler rotation over standard
+    *  signed axes{+X,+Y,+Z,-X,-Y,-Z} are supported:
+    *  - all axes X, Y, Z in each valid order (see below what order is valid)
+    *  - rotation over the axis is supported both over the positive and negative directions.
+    *  - both tait bryan and proper/classic Euler angles (i.e. the opposite).
+    *
+    * Since EulerSystem supports both positive and negative directions,
+    *  you may know this rotation distinction by other names:
+    *  - _right handed_ or _left handed_
+    *  - _counterclockwise_ or _clockwise_
+    *
+    * Notice that not all axes combinations are valid, and invalid ones trigger a static assertion.
+    * Same unsigned axes can't be neighbors, e.g. {X,X,Y} is invalid.
+    * This yields two and only two classes:
+    *  - _tait bryan_ - all unsigned axes are distinct, e.g. {X,Y,Z}
+    *  - _proper/classic Euler angles_ - The first and the third unsigned axes are equal,
+    *     and the second is different, e.g. {X,Y,X}
+    *
+    * ### Intrinsic vs extrinsic Euler systems ###
+    *
+    * Only intrinsic Euler systems are supported for simplicity.
+    *  If you want to use extrinsic Euler systems,
+    *   just use the equivalent intrinsic system, with the opposite order of axes and angles.
+    *  I.e. axes (A,B,C) become (C,B,A), and angles (a,b,c) become (c,b,a).
+    *
+    * ### Convenient user typedefs ###
+    *
+    * Convenient typedefs for EulerSystem exist (only for positive axes Euler systems),
+    *  in a form of EulerSystem{A}{B}{C}, e.g. \ref EulerSystemXYZ.
+    *
+    * ### Additional reading ###
+    *
+    * More information about Euler angles: https://en.wikipedia.org/wiki/Euler_angles
+    *
+    * \tparam _AlphaAxis the first fixed EulerAxis
+    *
+    * \tparam _BetaAxis the second fixed EulerAxis
+    *
+    * \tparam _GammaAxis the third fixed EulerAxis
+    */
+  template <int _AlphaAxis, int _BetaAxis, int _GammaAxis>
+  class EulerSystem
+  {
+    public:
+    // It's defined this way and not as enum, because I think
+    //  that enum is not guaranteed to support negative numbers
+    
+    /** The first rotation axis */
+    static const int AlphaAxis = _AlphaAxis;
+    
+    /** The second rotation axis */
+    static const int BetaAxis = _BetaAxis;
+    
+    /** The third rotation axis */
+    static const int GammaAxis = _GammaAxis;
+
+    enum
+    {
+      AlphaAxisAbs = internal::Abs<AlphaAxis>::value, /*!< the first rotation axis unsigned */
+      BetaAxisAbs = internal::Abs<BetaAxis>::value, /*!< the second rotation axis unsigned */
+      GammaAxisAbs = internal::Abs<GammaAxis>::value, /*!< the third rotation axis unsigned */
+      
+      IsAlphaOpposite = (AlphaAxis < 0) ? 1 : 0, /*!< whether alpha axis is negative */
+      IsBetaOpposite = (BetaAxis < 0) ? 1 : 0, /*!< whether beta axis is negative */
+      IsGammaOpposite = (GammaAxis < 0) ? 1 : 0, /*!< whether gamma axis is negative */
+      
+      IsOdd = ((AlphaAxisAbs)%3 == (BetaAxisAbs - 1)%3) ? 0 : 1, /*!< whether the Euler system is odd */
+      IsEven = IsOdd ? 0 : 1, /*!< whether the Euler system is even */
+
+      IsTaitBryan = ((unsigned)AlphaAxisAbs != (unsigned)GammaAxisAbs) ? 1 : 0 /*!< whether the Euler system is tait bryan */
+    };
+    
+    private:
+    
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<AlphaAxis>::value,
+      ALPHA_AXIS_IS_INVALID);
+      
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<BetaAxis>::value,
+      BETA_AXIS_IS_INVALID);
+      
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT(internal::IsValidAxis<GammaAxis>::value,
+      GAMMA_AXIS_IS_INVALID);
+      
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)AlphaAxisAbs != (unsigned)BetaAxisAbs,
+      ALPHA_AXIS_CANT_BE_EQUAL_TO_BETA_AXIS);
+      
+    EIGEN_EULER_ANGLES_CLASS_STATIC_ASSERT((unsigned)BetaAxisAbs != (unsigned)GammaAxisAbs,
+      BETA_AXIS_CANT_BE_EQUAL_TO_GAMMA_AXIS);
+
+    enum
+    {
+      // I, J, K are the pivot indexes permutation for the rotation matrix, that match this Euler system. 
+      // They are used in this class converters.
+      // They are always different from each other, and their possible values are: 0, 1, or 2.
+      I = AlphaAxisAbs - 1,
+      J = (AlphaAxisAbs - 1 + 1 + IsOdd)%3,
+      K = (AlphaAxisAbs - 1 + 2 - IsOdd)%3
+    };
+    
+    // TODO: Get @mat parameter in form that avoids double evaluation.
+    template <typename Derived>
+    static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar, 3, 1>& res, const MatrixBase<Derived>& mat, internal::true_type /*isTaitBryan*/)
+    {
+      using std::atan2;
+      using std::sin;
+      using std::cos;
+      
+      typedef typename Derived::Scalar Scalar;
+      typedef Matrix<Scalar,2,1> Vector2;
+      
+      res[0] = atan2(mat(J,K), mat(K,K));
+      Scalar c2 = Vector2(mat(I,I), mat(I,J)).norm();
+      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0))) {
+        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+        res[1] = atan2(-mat(I,K), -c2);
+      }
+      else
+        res[1] = atan2(-mat(I,K), c2);
+      Scalar s1 = sin(res[0]);
+      Scalar c1 = cos(res[0]);
+      res[2] = atan2(s1*mat(K,I)-c1*mat(J,I), c1*mat(J,J) - s1 * mat(K,J));
+    }
+
+    template <typename Derived>
+    static void CalcEulerAngles_imp(Matrix<typename MatrixBase<Derived>::Scalar,3,1>& res, const MatrixBase<Derived>& mat, internal::false_type /*isTaitBryan*/)
+    {
+      using std::atan2;
+      using std::sin;
+      using std::cos;
+
+      typedef typename Derived::Scalar Scalar;
+      typedef Matrix<Scalar,2,1> Vector2;
+      
+      res[0] = atan2(mat(J,I), mat(K,I));
+      if((IsOdd && res[0]<Scalar(0)) || ((!IsOdd) && res[0]>Scalar(0)))
+      {
+        res[0] = (res[0] > Scalar(0)) ? res[0] - Scalar(EIGEN_PI) : res[0] + Scalar(EIGEN_PI);
+        Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
+        res[1] = -atan2(s2, mat(I,I));
+      }
+      else
+      {
+        Scalar s2 = Vector2(mat(J,I), mat(K,I)).norm();
+        res[1] = atan2(s2, mat(I,I));
+      }
+
+      // With a=(0,1,0), we have i=0; j=1; k=2, and after computing the first two angles,
+      // we can compute their respective rotation, and apply its inverse to M. Since the result must
+      // be a rotation around x, we have:
+      //
+      //  c2  s1.s2 c1.s2                   1  0   0 
+      //  0   c1    -s1       *    M    =   0  c3  s3
+      //  -s2 s1.c2 c1.c2                   0 -s3  c3
+      //
+      //  Thus:  m11.c1 - m21.s1 = c3  &   m12.c1 - m22.s1 = s3
+
+      Scalar s1 = sin(res[0]);
+      Scalar c1 = cos(res[0]);
+      res[2] = atan2(c1*mat(J,K)-s1*mat(K,K), c1*mat(J,J) - s1 * mat(K,J));
+    }
+    
+    template<typename Scalar>
+    static void CalcEulerAngles(
+      EulerAngles<Scalar, EulerSystem>& res,
+      const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
+    {
+      CalcEulerAngles(res, mat, false, false, false); // delegate to the run-time-flag overload, requesting no positive range
+    }
+    
+    template<
+      bool PositiveRangeAlpha,
+      bool PositiveRangeBeta,
+      bool PositiveRangeGamma,
+      typename Scalar>
+    static void CalcEulerAngles(
+      EulerAngles<Scalar, EulerSystem>& res,
+      const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat)
+    {
+      CalcEulerAngles(res, mat, PositiveRangeAlpha, PositiveRangeBeta, PositiveRangeGamma); // pass the template flags on as ordinary bool arguments
+    }
+    
+    template<typename Scalar>
+    static void CalcEulerAngles(
+      EulerAngles<Scalar, EulerSystem>& res,
+      const typename EulerAngles<Scalar, EulerSystem>::Matrix3& mat,
+      bool PositiveRangeAlpha,
+      bool PositiveRangeBeta,
+      bool PositiveRangeGamma)
+    {
+      CalcEulerAngles_imp( // the overload is selected at compile time by the IsTaitBryan tag argument
+        res.angles(), mat,
+        typename internal::conditional<IsTaitBryan, internal::true_type, internal::false_type>::type());
+
+      if (IsAlphaOpposite == IsOdd) // negate alpha when the two flags are equal
+        res.alpha() = -res.alpha();
+        
+      if (IsBetaOpposite == IsOdd) // same rule for beta
+        res.beta() = -res.beta();
+        
+      if (IsGammaOpposite == IsOdd) // same rule for gamma
+        res.gamma() = -res.gamma();
+      
+      // Shift each negative angle up by 2*pi when a positive range was requested for it
+      if (PositiveRangeAlpha && (res.alpha() < 0))
+        res.alpha() += Scalar(2 * EIGEN_PI);
+      
+      if (PositiveRangeBeta && (res.beta() < 0))
+        res.beta() += Scalar(2 * EIGEN_PI);
+      
+      if (PositiveRangeGamma && (res.gamma() < 0))
+        res.gamma() += Scalar(2 * EIGEN_PI);
+    }
+    
+    template <typename _Scalar, class _System>
+    friend class Eigen::EulerAngles;
+  };
+
+#define EIGEN_EULER_SYSTEM_TYPEDEF(A, B, C) \
+  /** \ingroup EulerAngles_Module */ \
+  typedef EulerSystem<EULER_##A, EULER_##B, EULER_##C> EulerSystem##A##B##C;
+  
+  EIGEN_EULER_SYSTEM_TYPEDEF(X,Y,Z)
+  EIGEN_EULER_SYSTEM_TYPEDEF(X,Y,X)
+  EIGEN_EULER_SYSTEM_TYPEDEF(X,Z,Y)
+  EIGEN_EULER_SYSTEM_TYPEDEF(X,Z,X)
+  
+  EIGEN_EULER_SYSTEM_TYPEDEF(Y,Z,X)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Y,Z,Y)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Y,X,Z)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Y,X,Y)
+  
+  EIGEN_EULER_SYSTEM_TYPEDEF(Z,X,Y)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Z,X,Z)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Z,Y,X)
+  EIGEN_EULER_SYSTEM_TYPEDEF(Z,Y,Z)
+}
+
+#endif // EIGEN_EULERSYSTEM_H
diff --git a/unsupported/Eigen/src/FFT/CMakeLists.txt b/unsupported/Eigen/src/FFT/CMakeLists.txt
deleted file mode 100644
index edcffcb18..000000000
--- a/unsupported/Eigen/src/FFT/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_FFT_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_FFT_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/FFT COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt b/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt
deleted file mode 100644
index 7986afc5e..000000000
--- a/unsupported/Eigen/src/IterativeSolvers/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_IterativeSolvers_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_IterativeSolvers_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/IterativeSolvers COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt b/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt
deleted file mode 100644
index 4daefebee..000000000
--- a/unsupported/Eigen/src/KroneckerProduct/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_KroneckerProduct_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_KroneckerProduct_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/KroneckerProduct COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt b/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt
deleted file mode 100644
index d9690854d..000000000
--- a/unsupported/Eigen/src/LevenbergMarquardt/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_LevenbergMarquardt_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_LevenbergMarquardt_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/LevenbergMarquardt COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt b/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt
deleted file mode 100644
index cdde64d2c..000000000
--- a/unsupported/Eigen/src/MatrixFunctions/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_MatrixFunctions_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_MatrixFunctions_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/MatrixFunctions COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt b/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt
deleted file mode 100644
index 1b887cc8e..000000000
--- a/unsupported/Eigen/src/MoreVectorization/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_MoreVectorization_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_MoreVectorization_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/MoreVectorization COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt b/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt
deleted file mode 100644
index 9322ddadf..000000000
--- a/unsupported/Eigen/src/NonLinearOptimization/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_NonLinearOptimization_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_NonLinearOptimization_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/NonLinearOptimization COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt b/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt
deleted file mode 100644
index 1199aca2f..000000000
--- a/unsupported/Eigen/src/NumericalDiff/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_NumericalDiff_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_NumericalDiff_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/NumericalDiff COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/Polynomials/CMakeLists.txt b/unsupported/Eigen/src/Polynomials/CMakeLists.txt
deleted file mode 100644
index 51f13f3cb..000000000
--- a/unsupported/Eigen/src/Polynomials/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Polynomials_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Polynomials_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Polynomials COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/Skyline/CMakeLists.txt b/unsupported/Eigen/src/Skyline/CMakeLists.txt
deleted file mode 100644
index 3bf1b0dd4..000000000
--- a/unsupported/Eigen/src/Skyline/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Skyline_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Skyline_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Skyline COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/SparseExtra/CMakeLists.txt b/unsupported/Eigen/src/SparseExtra/CMakeLists.txt
deleted file mode 100644
index 7ea32ca5e..000000000
--- a/unsupported/Eigen/src/SparseExtra/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_SparseExtra_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_SparseExtra_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/SparseExtra COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/src/SpecialFunctions/CMakeLists.txt b/unsupported/Eigen/src/SpecialFunctions/CMakeLists.txt
deleted file mode 100644
index 25df9439d..000000000
--- a/unsupported/Eigen/src/SpecialFunctions/CMakeLists.txt
+++ /dev/null
@@ -1,11 +0,0 @@
-FILE(GLOB Eigen_SpecialFunctions_SRCS "*.h")
-INSTALL(FILES
-  ${Eigen_SpecialFunctions_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/SpecialFunctions COMPONENT Devel
-  )
-
-FILE(GLOB Eigen_SpecialFunctions_arch_CUDA_SRCS "arch/CUDA/*.h")
-INSTALL(FILES
-  ${Eigen_SpecialFunctions_arch_CUDA_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/SpecialFunctions/arch/CUDA COMPONENT Devel
-  )
-\ No newline at end of file
diff --git a/unsupported/Eigen/src/Splines/CMakeLists.txt b/unsupported/Eigen/src/Splines/CMakeLists.txt
deleted file mode 100644
index 55c6271e9..000000000
--- a/unsupported/Eigen/src/Splines/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_Splines_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_Splines_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/src/Splines COMPONENT Devel
-  )
diff --git a/unsupported/doc/examples/EulerAngles.cpp b/unsupported/doc/examples/EulerAngles.cpp
new file mode 100644
index 000000000..1ef6aee18
--- /dev/null
+++ b/unsupported/doc/examples/EulerAngles.cpp
@@ -0,0 +1,46 @@
+#include <unsupported/Eigen/EulerAngles>
+#include <iostream>
+
+using namespace Eigen;
+
+int main()
+{
+  // An Euler system commonly used by many armies around the world,
+  //  where the first one is the azimuth(the angle from the north -
+  //   the same angle that is shown on a compass)
+  //  and the second one is elevation(the angle from the horizon)
+  //  and the third one is roll(the angle between the horizontal body
+  //   direction and the plane ground surface)
+  // Keep in mind that we're using radian angles here!
+  typedef EulerSystem<-EULER_Z, EULER_Y, EULER_X> MyArmySystem;
+  typedef EulerAngles<double, MyArmySystem> MyArmyAngles;
+  
+  MyArmyAngles vehicleAngles(
+    3.14/*PI*/ / 2, /* heading to east, notice that this angle is counter-clockwise */
+    -0.3, /* going down from a mountain */
+    0.1); /* slightly rolled to the right */
+  
+  // Some Euler angles representation that our plane uses.
+  EulerAnglesZYZd planeAngles(0.78474, 0.5271, -0.513794);
+  
+  MyArmyAngles planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeAngles);
+  
+  std::cout << "vehicle angles(MyArmy):     " << vehicleAngles << std::endl;
+  std::cout << "plane angles(ZYZ):        " << planeAngles << std::endl;
+  std::cout << "plane angles(MyArmy):     " << planeAnglesInMyArmyAngles << std::endl;
+  
+  // Now let's rotate the plane a little bit
+  std::cout << "==========================================================\n";
+  std::cout << "rotating plane now!\n";
+  std::cout << "==========================================================\n";
+  
+  Quaterniond planeRotated = AngleAxisd(-0.342, Vector3d::UnitY()) * planeAngles;
+  
+  planeAngles = planeRotated;
+  planeAnglesInMyArmyAngles = MyArmyAngles::FromRotation<true, false, false>(planeRotated);
+  
+  std::cout << "new plane angles(ZYZ):     " << planeAngles << std::endl;
+  std::cout << "new plane angles(MyArmy): " << planeAnglesInMyArmyAngles << std::endl;
+  
+  return 0;
+}
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index de9b5243a..0d7ed1db2 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -59,6 +59,8 @@ ei_add_test(alignedvector3)
 
 ei_add_test(FFT)
 
+ei_add_test(EulerAngles)
+
 find_package(MPFR 2.3.0)
 find_package(GMP)
 if(MPFR_FOUND AND EIGEN_COMPILER_SUPPORT_CXX11)
@@ -230,20 +232,25 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA)
   cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include")
   set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
 
-  ei_add_test(cxx11_tensor_device)
-  ei_add_test(cxx11_tensor_cuda)
-  ei_add_test(cxx11_tensor_contract_cuda)
+  ei_add_test(cxx11_tensor_complex_cuda)
   ei_add_test(cxx11_tensor_reduction_cuda)
   ei_add_test(cxx11_tensor_argmax_cuda)
   ei_add_test(cxx11_tensor_cast_float16_cuda)
   ei_add_test(cxx11_tensor_scan_cuda)
 
+  # Contractions require arch 3.0 or higher
+  if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29)
+    ei_add_test(cxx11_tensor_device)
+    ei_add_test(cxx11_tensor_cuda)
+    ei_add_test(cxx11_tensor_contract_cuda)
+    ei_add_test(cxx11_tensor_of_float16_cuda)
+  endif()
+
   # The random number generation code requires arch 3.5 or greater.
   if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34)
     ei_add_test(cxx11_tensor_random_cuda)
   endif()
 
-  ei_add_test(cxx11_tensor_of_float16_cuda)
 
   unset(EIGEN_ADD_TEST_FILENAME_EXTENSION)
 endif()
diff --git a/unsupported/test/EulerAngles.cpp b/unsupported/test/EulerAngles.cpp
new file mode 100644
index 000000000..a8cb52864
--- /dev/null
+++ b/unsupported/test/EulerAngles.cpp
@@ -0,0 +1,208 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2015 Tal Hadad <tal_hd@hotmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#include "main.h"
+
+#include <unsupported/Eigen/EulerAngles>
+
+using namespace Eigen;
+
+template<typename EulerSystem, typename Scalar>
+void verify_euler_ranged(const Matrix<Scalar,3,1>& ea,
+  bool positiveRangeAlpha, bool positiveRangeBeta, bool positiveRangeGamma)
+{
+  typedef EulerAngles<Scalar, EulerSystem> EulerAnglesType;
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Quaternion<Scalar> QuaternionType;
+  typedef AngleAxis<Scalar> AngleAxisType;
+  using std::abs;
+  
+  Scalar alphaRangeStart, alphaRangeEnd;
+  Scalar betaRangeStart, betaRangeEnd;
+  Scalar gammaRangeStart, gammaRangeEnd;
+  
+  if (positiveRangeAlpha)
+  {
+    alphaRangeStart = Scalar(0);
+    alphaRangeEnd = Scalar(2 * EIGEN_PI);
+  }
+  else
+  {
+    alphaRangeStart = -Scalar(EIGEN_PI);
+    alphaRangeEnd = Scalar(EIGEN_PI);
+  }
+  
+  if (positiveRangeBeta)
+  {
+    betaRangeStart = Scalar(0);
+    betaRangeEnd = Scalar(2 * EIGEN_PI);
+  }
+  else
+  {
+    betaRangeStart = -Scalar(EIGEN_PI);
+    betaRangeEnd = Scalar(EIGEN_PI);
+  }
+  
+  if (positiveRangeGamma)
+  {
+    gammaRangeStart = Scalar(0);
+    gammaRangeEnd = Scalar(2 * EIGEN_PI);
+  }
+  else
+  {
+    gammaRangeStart = -Scalar(EIGEN_PI);
+    gammaRangeEnd = Scalar(EIGEN_PI);
+  }
+  
+  const int i = EulerSystem::AlphaAxisAbs - 1;
+  const int j = EulerSystem::BetaAxisAbs - 1;
+  const int k = EulerSystem::GammaAxisAbs - 1;
+  
+  const int iFactor = EulerSystem::IsAlphaOpposite ? -1 : 1;
+  const int jFactor = EulerSystem::IsBetaOpposite ? -1 : 1;
+  const int kFactor = EulerSystem::IsGammaOpposite ? -1 : 1;
+  
+  const Vector3 I = EulerAnglesType::AlphaAxisVector();
+  const Vector3 J = EulerAnglesType::BetaAxisVector();
+  const Vector3 K = EulerAnglesType::GammaAxisVector();
+  
+  EulerAnglesType e(ea[0], ea[1], ea[2]);
+  
+  Matrix3 m(e);
+  Vector3 eabis = EulerAnglesType(m, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
+  
+  // Check that eabis in range
+  VERIFY(alphaRangeStart <= eabis[0] && eabis[0] <= alphaRangeEnd);
+  VERIFY(betaRangeStart <= eabis[1] && eabis[1] <= betaRangeEnd);
+  VERIFY(gammaRangeStart <= eabis[2] && eabis[2] <= gammaRangeEnd);
+  
+  Vector3 eabis2 = m.eulerAngles(i, j, k);
+  
+  // Invert the relevant axes
+  eabis2[0] *= iFactor;
+  eabis2[1] *= jFactor;
+  eabis2[2] *= kFactor;
+  
+  // Saturate the angles to the correct range
+  if (positiveRangeAlpha && (eabis2[0] < 0))
+    eabis2[0] += Scalar(2 * EIGEN_PI);
+  if (positiveRangeBeta && (eabis2[1] < 0))
+    eabis2[1] += Scalar(2 * EIGEN_PI);
+  if (positiveRangeGamma && (eabis2[2] < 0))
+    eabis2[2] += Scalar(2 * EIGEN_PI);
+  
+  VERIFY_IS_APPROX(eabis, eabis2);// Verify that our estimation is the same as m.eulerAngles() is
+  
+  Matrix3 mbis(AngleAxisType(eabis[0], I) * AngleAxisType(eabis[1], J) * AngleAxisType(eabis[2], K));
+  VERIFY_IS_APPROX(m,  mbis);
+  
+  // Tests that are only relevant for no positive range
+  if (!(positiveRangeAlpha || positiveRangeBeta || positiveRangeGamma))
+  {
+    /* If I==K, and ea[1]==0, then there is no unique solution. */
+    /* The same remark applies in the case where I!=K, and |ea[1]| is close to pi/2. */
+    if( (i!=k || ea[1]!=0) && (i==k || !internal::isApprox(abs(ea[1]),Scalar(EIGEN_PI/2),test_precision<Scalar>())) ) 
+      VERIFY((ea-eabis).norm() <= test_precision<Scalar>());
+    
+    // approx_or_less_than does not work for 0
+    VERIFY(0 < eabis[0] || test_isMuchSmallerThan(eabis[0], Scalar(1)));
+  }
+  
+  // Quaternions
+  QuaternionType q(e);
+  eabis = EulerAnglesType(q, positiveRangeAlpha, positiveRangeBeta, positiveRangeGamma).angles();
+  VERIFY_IS_APPROX(eabis, eabis2);// Verify that the euler angles are still the same
+}
+
+template<typename EulerSystem, typename Scalar>
+void verify_euler(const Matrix<Scalar,3,1>& ea) // runs verify_euler_ranged on ea for all 8 combinations of the three range flags
+{
+  verify_euler_ranged<EulerSystem>(ea, false, false, false);
+  verify_euler_ranged<EulerSystem>(ea, false, false, true);
+  verify_euler_ranged<EulerSystem>(ea, false, true, false);
+  verify_euler_ranged<EulerSystem>(ea, false, true, true);
+  verify_euler_ranged<EulerSystem>(ea, true, false, false);
+  verify_euler_ranged<EulerSystem>(ea, true, false, true);
+  verify_euler_ranged<EulerSystem>(ea, true, true, false);
+  verify_euler_ranged<EulerSystem>(ea, true, true, true);
+}
+
+template<typename Scalar> void check_all_var(const Matrix<Scalar,3,1>& ea)
+{
+  verify_euler<EulerSystemXYZ>(ea);
+  verify_euler<EulerSystemXYX>(ea);
+  verify_euler<EulerSystemXZY>(ea);
+  verify_euler<EulerSystemXZX>(ea);
+  
+  verify_euler<EulerSystemYZX>(ea);
+  verify_euler<EulerSystemYZY>(ea);
+  verify_euler<EulerSystemYXZ>(ea);
+  verify_euler<EulerSystemYXY>(ea);
+  
+  verify_euler<EulerSystemZXY>(ea);
+  verify_euler<EulerSystemZXZ>(ea);
+  verify_euler<EulerSystemZYX>(ea);
+  verify_euler<EulerSystemZYZ>(ea);
+}
+
+template<typename Scalar> void eulerangles()
+{
+  typedef Matrix<Scalar,3,3> Matrix3;
+  typedef Matrix<Scalar,3,1> Vector3;
+  typedef Array<Scalar,3,1> Array3;
+  typedef Quaternion<Scalar> Quaternionx;
+  typedef AngleAxis<Scalar> AngleAxisType;
+
+  Scalar a = internal::random<Scalar>(-Scalar(EIGEN_PI), Scalar(EIGEN_PI));
+  Quaternionx q1;
+  q1 = AngleAxisType(a, Vector3::Random().normalized());
+  Matrix3 m;
+  m = q1;
+  
+  Vector3 ea = m.eulerAngles(0,1,2);
+  check_all_var(ea);
+  ea = m.eulerAngles(0,1,0);
+  check_all_var(ea);
+  
+  // Check with purely random Quaternion:
+  q1.coeffs() = Quaternionx::Coefficients::Random().normalized();
+  m = q1;
+  ea = m.eulerAngles(0,1,2);
+  check_all_var(ea);
+  ea = m.eulerAngles(0,1,0);
+  check_all_var(ea);
+  
+  // Check with random angles in range [0:pi]x[-pi:pi]x[-pi:pi].
+  ea = (Array3::Random() + Array3(1,0,0))*Scalar(EIGEN_PI)*Array3(0.5,1,1);
+  check_all_var(ea);
+  
+  ea[2] = ea[0] = internal::random<Scalar>(0,Scalar(EIGEN_PI));
+  check_all_var(ea);
+  
+  ea[0] = ea[1] = internal::random<Scalar>(0,Scalar(EIGEN_PI));
+  check_all_var(ea);
+  
+  ea[1] = 0;
+  check_all_var(ea);
+  
+  ea.head(2).setZero();
+  check_all_var(ea);
+  
+  ea.setZero();
+  check_all_var(ea);
+}
+
+void test_EulerAngles()
+{
+  for(int i = 0; i < g_repeat; i++) { // run the float and double subtests g_repeat times
+    CALL_SUBTEST_1( eulerangles<float>() );
+    CALL_SUBTEST_2( eulerangles<double>() );
+  }
+}
diff --git a/unsupported/test/cxx11_eventcount.cpp b/unsupported/test/cxx11_eventcount.cpp
index f16cc6f07..3b598bf42 100644
--- a/unsupported/test/cxx11_eventcount.cpp
+++ b/unsupported/test/cxx11_eventcount.cpp
@@ -25,7 +25,8 @@ int rand_reentrant(unsigned int* s) {
 
 static void test_basic_eventcount()
 {
-  std::vector<EventCount::Waiter> waiters(1);
+  MaxSizeVector<EventCount::Waiter> waiters(1);
+  waiters.resize(1);
   EventCount ec(waiters);
   EventCount::Waiter& w = waiters[0];
   ec.Notify(false);
@@ -81,7 +82,8 @@ static void test_stress_eventcount()
   static const int kEvents = 1 << 16;
   static const int kQueues = 10;
 
-  std::vector<EventCount::Waiter> waiters(kThreads);
+  MaxSizeVector<EventCount::Waiter> waiters(kThreads);
+  waiters.resize(kThreads);
   EventCount ec(waiters);
   TestQueue queues[kQueues];
 
diff --git a/unsupported/test/cxx11_tensor_argmax_cuda.cu b/unsupported/test/cxx11_tensor_argmax_cuda.cu
index 41ccbe974..6fe8982f2 100644
--- a/unsupported/test/cxx11_tensor_argmax_cuda.cu
+++ b/unsupported/test/cxx11_tensor_argmax_cuda.cu
@@ -12,6 +12,9 @@
 #define EIGEN_TEST_FUNC cxx11_tensor_cuda
 #define EIGEN_USE_GPU
 
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
diff --git a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
index f22b99de8..88c233994 100644
--- a/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cast_float16_cuda.cu
@@ -13,7 +13,9 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
diff --git a/unsupported/test/cxx11_tensor_complex_cuda.cu b/unsupported/test/cxx11_tensor_complex_cuda.cu
new file mode 100644
index 000000000..74befe670
--- /dev/null
+++ b/unsupported/test/cxx11_tensor_complex_cuda.cu
@@ -0,0 +1,78 @@
+// This file is part of Eigen, a lightweight C++ template library
+// for linear algebra.
+//
+// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
+//
+// This Source Code Form is subject to the terms of the Mozilla
+// Public License v. 2.0. If a copy of the MPL was not distributed
+// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
+
+#define EIGEN_TEST_NO_LONGDOUBLE
+#define EIGEN_TEST_FUNC cxx11_tensor_complex
+#define EIGEN_USE_GPU
+
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
+#include "main.h"
+#include <unsupported/Eigen/CXX11/Tensor>
+
+using Eigen::Tensor;
+
+void test_cuda_nullary() {  // checks constant() and abs() on complex<float> tensors evaluated through a GpuDevice
+  Tensor<std::complex<float>, 1, 0, int> in1(2);
+  Tensor<std::complex<float>, 1, 0, int> in2(2);
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t float_bytes = in1.size() * sizeof(float);
+  std::size_t complex_bytes = in1.size() * sizeof(std::complex<float>);
+
+  std::complex<float>* d_in1;
+  std::complex<float>* d_in2;
+  float* d_out2;
+  cudaMalloc((void**)(&d_in1), complex_bytes);
+  cudaMalloc((void**)(&d_in2), complex_bytes);
+  cudaMalloc((void**)(&d_out2), float_bytes);
+  cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1(
+      d_in1, 2);
+  Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in2(
+      d_in2, 2);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_out2(
+      d_out2, 2);
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(std::complex<float>(3.14f, 2.7f));  // overwrite in1 on the device with a constant
+  gpu_out2.device(gpu_device) = gpu_in2.abs();  // element-wise magnitude of in2
+
+  Tensor<std::complex<float>, 1, 0, int> new1(2);
+  Tensor<float, 1, 0, int> new2(2);
+
+  VERIFY(cudaMemcpyAsync(new1.data(), d_in1, complex_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);  // VERIFY, not assert: the copy must run even if assert() is compiled out
+  VERIFY(cudaMemcpyAsync(new2.data(), d_out2, float_bytes, cudaMemcpyDeviceToHost,
+                         gpu_device.stream()) == cudaSuccess);
+
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);  // wait for both copies before reading the host tensors
+
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(new1(i), std::complex<float>(3.14f, 2.7f));
+    VERIFY_IS_APPROX(new2(i), std::abs(in2(i)));
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+  cudaFree(d_out2);
+}
+
+
+
+void test_cxx11_tensor_complex()
+{
+  CALL_SUBTEST(test_cuda_nullary());
+}
diff --git a/unsupported/test/cxx11_tensor_contract_cuda.cu b/unsupported/test/cxx11_tensor_contract_cuda.cu
index 98ac180ef..767e9c678 100644
--- a/unsupported/test/cxx11_tensor_contract_cuda.cu
+++ b/unsupported/test/cxx11_tensor_contract_cuda.cu
@@ -14,7 +14,9 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
diff --git a/unsupported/test/cxx11_tensor_contraction.cpp b/unsupported/test/cxx11_tensor_contraction.cpp
index 73623b2ed..ace97057f 100644
--- a/unsupported/test/cxx11_tensor_contraction.cpp
+++ b/unsupported/test/cxx11_tensor_contraction.cpp
@@ -489,6 +489,27 @@ static void test_tensor_product()
 }
 
 
+template<int DataLayout>
+static void test_const_inputs()  // Contracts two TensorMaps over const float and checks every entry of the result.
+{
+  Tensor<float, 2, DataLayout> in1(2, 3);
+  Tensor<float, 2, DataLayout> in2(3, 2);
+  in1.setRandom();
+  in2.setRandom();
+
+  TensorMap<Tensor<const float, 2, DataLayout> > mat1(in1.data(), 2, 3);  // const-element view over in1's buffer
+  TensorMap<Tensor<const float, 2, DataLayout> > mat2(in2.data(), 3, 2);  // const-element view over in2's buffer
+  Tensor<float, 2, DataLayout> mat3(2,2);
+
+  Eigen::array<DimPair, 1> dims = {{DimPair(1, 0)}};  // pair dimension 1 of mat1 with dimension 0 of mat2
+  mat3 = mat1.contract(mat2, dims);
+
+  VERIFY_IS_APPROX(mat3(0,0), mat1(0,0)*mat2(0,0) + mat1(0,1)*mat2(1,0) + mat1(0,2)*mat2(2,0));  // each entry is compared with the hand-expanded sum of products
+  VERIFY_IS_APPROX(mat3(0,1), mat1(0,0)*mat2(0,1) + mat1(0,1)*mat2(1,1) + mat1(0,2)*mat2(2,1));
+  VERIFY_IS_APPROX(mat3(1,0), mat1(1,0)*mat2(0,0) + mat1(1,1)*mat2(1,0) + mat1(1,2)*mat2(2,0));
+  VERIFY_IS_APPROX(mat3(1,1), mat1(1,0)*mat2(0,1) + mat1(1,1)*mat2(1,1) + mat1(1,2)*mat2(2,1));
+}
+
 void test_cxx11_tensor_contraction()
 {
   CALL_SUBTEST(test_evals<ColMajor>());
@@ -519,4 +540,6 @@ void test_cxx11_tensor_contraction()
   CALL_SUBTEST(test_small_blocking_factors<RowMajor>());
   CALL_SUBTEST(test_tensor_product<ColMajor>());
   CALL_SUBTEST(test_tensor_product<RowMajor>());
+  CALL_SUBTEST(test_const_inputs<ColMajor>());
+  CALL_SUBTEST(test_const_inputs<RowMajor>());
 }
diff --git a/unsupported/test/cxx11_tensor_cuda.cu b/unsupported/test/cxx11_tensor_cuda.cu
index 284b46803..bf216587a 100644
--- a/unsupported/test/cxx11_tensor_cuda.cu
+++ b/unsupported/test/cxx11_tensor_cuda.cu
@@ -10,19 +10,65 @@
 #define EIGEN_TEST_NO_LONGDOUBLE
 #define EIGEN_TEST_NO_COMPLEX
 #define EIGEN_TEST_FUNC cxx11_tensor_cuda
-#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
 using Eigen::Tensor;
 
+// Checks that nullary expressions (constant() and random()) assigned through a
+// GpuDevice overwrite the device buffers as expected.
+void test_cuda_nullary() {
+  Tensor<float, 1, 0, int> in1(2);
+  Tensor<float, 1, 0, int> in2(2);
+  in1.setRandom();
+  in2.setRandom();
+
+  std::size_t tensor_bytes = in1.size() * sizeof(float);
+
+  float* d_in1;
+  float* d_in2;
+  // Every CUDA call is checked with VERIFY (not assert) so the call is still
+  // made and validated if assert() is compiled out.
+  VERIFY(cudaMalloc((void**)(&d_in1), tensor_bytes) == cudaSuccess);
+  VERIFY(cudaMalloc((void**)(&d_in2), tensor_bytes) == cudaSuccess);
+  VERIFY(cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice) == cudaSuccess);
+  VERIFY(cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice) == cudaSuccess);
+
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice gpu_device(&stream);
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1(d_in1, 2);
+  Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in2(d_in2, 2);
+
+  gpu_in1.device(gpu_device) = gpu_in1.constant(3.14f);
+  gpu_in2.device(gpu_device) = gpu_in2.random();
+
+  Tensor<float, 1, 0, int> new1(2);
+  Tensor<float, 1, 0, int> new2(2);
+
+  VERIFY(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+  VERIFY(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
+
+  VERIFY(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+
+  for (int i = 0; i < 2; ++i) {
+    VERIFY_IS_APPROX(new1(i), 3.14f);
+    VERIFY_IS_NOT_EQUAL(new2(i), in2(i));  // random() must have replaced the uploaded values
+  }
+
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+}
+
 void test_cuda_elementwise_small() {
-  Tensor<float, 1> in1(Eigen::array<int, 1>(2));
-  Tensor<float, 1> in2(Eigen::array<int, 1>(2));
-  Tensor<float, 1> out(Eigen::array<int, 1>(2));
+  Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2));
+  Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2));
+  Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2));
   in1.setRandom();
   in2.setRandom();
 
@@ -44,11 +90,11 @@ void test_cuda_elementwise_small() {
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1(
-      d_in1, Eigen::array<int, 1>(2));
+      d_in1, Eigen::array<Eigen::DenseIndex, 1>(2));
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in2(
-      d_in2, Eigen::array<int, 1>(2));
+      d_in2, Eigen::array<Eigen::DenseIndex, 1>(2));
   Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_out(
-      d_out, Eigen::array<int, 1>(2));
+      d_out, Eigen::array<Eigen::DenseIndex, 1>(2));
 
   gpu_out.device(gpu_device) = gpu_in1 + gpu_in2;
 
@@ -58,8 +104,8 @@ void test_cuda_elementwise_small() {
 
   for (int i = 0; i < 2; ++i) {
     VERIFY_IS_APPROX(
-        out(Eigen::array<int, 1>(i)),
-        in1(Eigen::array<int, 1>(i)) + in2(Eigen::array<int, 1>(i)));
+        out(Eigen::array<Eigen::DenseIndex, 1>(i)),
+        in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i)));
   }
 
   cudaFree(d_in1);
@@ -69,10 +115,10 @@ void test_cuda_elementwise_small() {
 
 void test_cuda_elementwise()
 {
-  Tensor<float, 3> in1(Eigen::array<int, 3>(72,53,97));
-  Tensor<float, 3> in2(Eigen::array<int, 3>(72,53,97));
-  Tensor<float, 3> in3(Eigen::array<int, 3>(72,53,97));
-  Tensor<float, 3> out(Eigen::array<int, 3>(72,53,97));
+  Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> in3(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Tensor<float, 3> out(Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
   in1.setRandom();
   in2.setRandom();
   in3.setRandom();
@@ -98,10 +144,10 @@ void test_cuda_elementwise()
   Eigen::CudaStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<int, 3>(72,53,97));
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<int, 3>(72,53,97));
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<int, 3>(72,53,97));
-  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<int, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in3(d_in3, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
+  Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_out(d_out, Eigen::array<Eigen::DenseIndex, 3>(72,53,97));
 
   gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3;
 
@@ -111,7 +157,7 @@ void test_cuda_elementwise()
   for (int i = 0; i < 72; ++i) {
     for (int j = 0; j < 53; ++j) {
       for (int k = 0; k < 97; ++k) {
-        VERIFY_IS_APPROX(out(Eigen::array<int, 3>(i,j,k)), in1(Eigen::array<int, 3>(i,j,k)) + in2(Eigen::array<int, 3>(i,j,k)) * in3(Eigen::array<int, 3>(i,j,k)));
+        VERIFY_IS_APPROX(out(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)), in1(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) + in2(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)) * in3(Eigen::array<Eigen::DenseIndex, 3>(i,j,k)));
       }
     }
   }
@@ -181,7 +227,7 @@ void test_cuda_reduction()
   Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113);
   Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97);
 
-  array<int, 2> reduction_axis;
+  array<Eigen::DenseIndex, 2> reduction_axis;
   reduction_axis[0] = 1;
   reduction_axis[1] = 3;
 
@@ -214,8 +260,8 @@ void test_cuda_contraction()
   // more than 30 * 1024, which is the number of threads in blocks on
   // a 15 SM GK110 GPU
   Tensor<float, 4, DataLayout> t_left(6, 50, 3, 31);
-  Tensor<float, 5, DataLayout> t_right(Eigen::array<int, 5>(3, 31, 7, 20, 1));
-  Tensor<float, 5, DataLayout> t_result(Eigen::array<int, 5>(6, 50, 7, 20, 1));
+  Tensor<float, 5, DataLayout> t_right(Eigen::array<Eigen::DenseIndex, 5>(3, 31, 7, 20, 1));
+  Tensor<float, 5, DataLayout> t_result(Eigen::array<Eigen::DenseIndex, 5>(6, 50, 7, 20, 1));
 
   t_left.setRandom();
   t_right.setRandom();
@@ -299,7 +345,7 @@ void test_cuda_convolution_1d()
   Eigen::TensorMap<Eigen::Tensor<float, 1, DataLayout> > gpu_kernel(d_kernel, 4);
   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out, 74,34,11,137);
 
-  Eigen::array<int, 1> dims(1);
+  Eigen::array<Eigen::DenseIndex, 1> dims(1);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -352,7 +398,7 @@ void test_cuda_convolution_inner_dim_col_major_1d()
   Eigen::TensorMap<Eigen::Tensor<float, 1, ColMajor> > gpu_kernel(d_kernel,4);
   Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_out(d_out,71,9,11,7);
 
-  Eigen::array<int, 1> dims(0);
+  Eigen::array<Eigen::DenseIndex, 1> dims(0);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -405,7 +451,7 @@ void test_cuda_convolution_inner_dim_row_major_1d()
   Eigen::TensorMap<Eigen::Tensor<float, 1, RowMajor> > gpu_kernel(d_kernel, 4);
   Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_out(d_out, 7,9,11,71);
 
-  Eigen::array<int, 1> dims(3);
+  Eigen::array<Eigen::DenseIndex, 1> dims(3);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -459,7 +505,7 @@ void test_cuda_convolution_2d()
   Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > gpu_kernel(d_kernel,3,4);
   Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_out(d_out,74,35,8,137);
 
-  Eigen::array<int, 2> dims(1,2);
+  Eigen::array<Eigen::DenseIndex, 2> dims(1,2);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -496,9 +542,9 @@ void test_cuda_convolution_2d()
 template<int DataLayout>
 void test_cuda_convolution_3d()
 {
-  Tensor<float, 5, DataLayout> input(Eigen::array<int, 5>(74,37,11,137,17));
+  Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17));
   Tensor<float, 3, DataLayout> kernel(3,4,2);
-  Tensor<float, 5, DataLayout> out(Eigen::array<int, 5>(74,35,8,136,17));
+  Tensor<float, 5, DataLayout> out(Eigen::array<Eigen::DenseIndex, 5>(74,35,8,136,17));
   input = input.constant(10.0f) + input.random();
   kernel = kernel.constant(7.0f) + kernel.random();
 
@@ -523,7 +569,7 @@ void test_cuda_convolution_3d()
   Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > gpu_kernel(d_kernel,3,4,2);
   Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_out(d_out,74,35,8,136,17);
 
-  Eigen::array<int, 3> dims(1,2,3);
+  Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3);
   gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims);
 
   assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
@@ -1168,6 +1214,7 @@ void test_cuda_betainc()
 
 void test_cxx11_tensor_cuda()
 {
+  CALL_SUBTEST_1(test_cuda_nullary());
   CALL_SUBTEST_1(test_cuda_elementwise_small());
   CALL_SUBTEST_1(test_cuda_elementwise());
   CALL_SUBTEST_1(test_cuda_props());
diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu
index b6ca54d93..fde20ddf2 100644
--- a/unsupported/test/cxx11_tensor_device.cu
+++ b/unsupported/test/cxx11_tensor_device.cu
@@ -13,7 +13,9 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
diff --git a/unsupported/test/cxx11_tensor_of_float16_cuda.cu b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
index 2f55f9361..cbf401c86 100644
--- a/unsupported/test/cxx11_tensor_of_float16_cuda.cu
+++ b/unsupported/test/cxx11_tensor_of_float16_cuda.cu
@@ -13,7 +13,9 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
@@ -181,30 +183,39 @@ void test_cuda_trancendental() {
 
   float* d_float1 = (float*)gpu_device.allocate(num_elem * sizeof(float));
   float* d_float2 = (float*)gpu_device.allocate(num_elem * sizeof(float));
+  float* d_float3 = (float*)gpu_device.allocate(num_elem * sizeof(float));
   Eigen::half* d_res1_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
   Eigen::half* d_res1_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
   Eigen::half* d_res2_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
   Eigen::half* d_res2_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
-
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(
-      d_float1, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(
-      d_float2, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(
-      d_res1_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(
-      d_res1_float, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(
-      d_res2_half, num_elem);
-  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(
-      d_res2_float, num_elem);
+  Eigen::half* d_res3_half = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+  Eigen::half* d_res3_float = (Eigen::half*)gpu_device.allocate(num_elem * sizeof(Eigen::half));
+
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float1(d_float1, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float2(d_float2, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_float3(d_float3, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_half(d_res1_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res1_float(d_res1_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_half(d_res2_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res2_float(d_res2_float, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_half(d_res3_half, num_elem);
+  Eigen::TensorMap<Eigen::Tensor<Eigen::half, 1>, Eigen::Aligned> gpu_res3_float(d_res3_float, num_elem);
 
   gpu_float1.device(gpu_device) = gpu_float1.random() - gpu_float1.constant(0.5f);
   gpu_float2.device(gpu_device) = gpu_float2.random() + gpu_float1.constant(0.5f);
+  gpu_float3.device(gpu_device) = gpu_float3.random();
   gpu_res1_float.device(gpu_device) = gpu_float1.exp().cast<Eigen::half>();
   gpu_res2_float.device(gpu_device) = gpu_float2.log().cast<Eigen::half>();
-  gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>().exp();
-  gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>().log();
+  gpu_res3_float.device(gpu_device) = gpu_float3.log1p().cast<Eigen::half>();
+
+  gpu_res1_half.device(gpu_device) = gpu_float1.cast<Eigen::half>();
+  gpu_res1_half.device(gpu_device) = gpu_res1_half.exp();
+
+  gpu_res2_half.device(gpu_device) = gpu_float2.cast<Eigen::half>();
+  gpu_res2_half.device(gpu_device) = gpu_res2_half.log();
+
+  gpu_res3_half.device(gpu_device) = gpu_float3.cast<Eigen::half>();
+  gpu_res3_half.device(gpu_device) = gpu_res3_half.log1p();
 
   Tensor<float, 1> input1(num_elem);
   Tensor<Eigen::half, 1> half_prec1(num_elem);
@@ -212,12 +223,18 @@ void test_cuda_trancendental() {
   Tensor<float, 1> input2(num_elem);
   Tensor<Eigen::half, 1> half_prec2(num_elem);
   Tensor<Eigen::half, 1> full_prec2(num_elem);
+  Tensor<float, 1> input3(num_elem);
+  Tensor<Eigen::half, 1> half_prec3(num_elem);
+  Tensor<Eigen::half, 1> full_prec3(num_elem);
   gpu_device.memcpyDeviceToHost(input1.data(), d_float1, num_elem*sizeof(float));
   gpu_device.memcpyDeviceToHost(input2.data(), d_float2, num_elem*sizeof(float));
+  gpu_device.memcpyDeviceToHost(input3.data(), d_float3, num_elem*sizeof(float));
   gpu_device.memcpyDeviceToHost(half_prec1.data(), d_res1_half, num_elem*sizeof(Eigen::half));
   gpu_device.memcpyDeviceToHost(full_prec1.data(), d_res1_float, num_elem*sizeof(Eigen::half));
   gpu_device.memcpyDeviceToHost(half_prec2.data(), d_res2_half, num_elem*sizeof(Eigen::half));
   gpu_device.memcpyDeviceToHost(full_prec2.data(), d_res2_float, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(half_prec3.data(), d_res3_half, num_elem*sizeof(Eigen::half));
+  gpu_device.memcpyDeviceToHost(full_prec3.data(), d_res3_float, num_elem*sizeof(Eigen::half));
   gpu_device.synchronize();
 
   for (int i = 0; i < num_elem; ++i) {
@@ -231,12 +248,19 @@ void test_cuda_trancendental() {
     else
       VERIFY_IS_APPROX(full_prec2(i), half_prec2(i));
   }
+  for (int i = 0; i < num_elem; ++i) {
+    std::cout << "Checking elemwise plog1 " << i << " input = " << input3(i) << " full = " << full_prec3(i) << " half = " << half_prec3(i) << std::endl;
+    VERIFY_IS_APPROX(full_prec3(i), half_prec3(i));
+  }
   gpu_device.deallocate(d_float1);
   gpu_device.deallocate(d_float2);
+  gpu_device.deallocate(d_float3);
   gpu_device.deallocate(d_res1_half);
   gpu_device.deallocate(d_res1_float);
   gpu_device.deallocate(d_res2_half);
   gpu_device.deallocate(d_res2_float);
+  gpu_device.deallocate(d_res3_float);
+  gpu_device.deallocate(d_res3_half);
 }
 
 template<typename>
diff --git a/unsupported/test/cxx11_tensor_random_cuda.cu b/unsupported/test/cxx11_tensor_random_cuda.cu
index fa1a46732..b3be199e1 100644
--- a/unsupported/test/cxx11_tensor_random_cuda.cu
+++ b/unsupported/test/cxx11_tensor_random_cuda.cu
@@ -13,6 +13,9 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
 #include "main.h"
 #include <Eigen/CXX11/Tensor>
 
diff --git a/unsupported/test/cxx11_tensor_reduction_cuda.cu b/unsupported/test/cxx11_tensor_reduction_cuda.cu
index cad0c08e0..6858b43a7 100644
--- a/unsupported/test/cxx11_tensor_reduction_cuda.cu
+++ b/unsupported/test/cxx11_tensor_reduction_cuda.cu
@@ -12,11 +12,14 @@
 #define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda
 #define EIGEN_USE_GPU
 
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
 
-template<int DataLayout>
+template<typename Type, int DataLayout>
 static void test_full_reductions() {
 
   Eigen::CudaStreamDevice stream;
@@ -25,24 +28,24 @@ static void test_full_reductions() {
   const int num_rows = internal::random<int>(1024, 5*1024);
   const int num_cols = internal::random<int>(1024, 5*1024);
 
-  Tensor<float, 2, DataLayout> in(num_rows, num_cols);
+  Tensor<Type, 2, DataLayout> in(num_rows, num_cols);
   in.setRandom();
 
-  Tensor<float, 0, DataLayout> full_redux;
+  Tensor<Type, 0, DataLayout> full_redux;
   full_redux = in.sum();
 
-  std::size_t in_bytes = in.size() * sizeof(float);
-  std::size_t out_bytes = full_redux.size() * sizeof(float);
-  float* gpu_in_ptr = static_cast<float*>(gpu_device.allocate(in_bytes));
-  float* gpu_out_ptr = static_cast<float*>(gpu_device.allocate(out_bytes));
+  std::size_t in_bytes = in.size() * sizeof(Type);
+  std::size_t out_bytes = full_redux.size() * sizeof(Type);
+  Type* gpu_in_ptr = static_cast<Type*>(gpu_device.allocate(in_bytes));
+  Type* gpu_out_ptr = static_cast<Type*>(gpu_device.allocate(out_bytes));
   gpu_device.memcpyHostToDevice(gpu_in_ptr, in.data(), in_bytes);
 
-  TensorMap<Tensor<float, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
-  TensorMap<Tensor<float, 0, DataLayout> > out_gpu(gpu_out_ptr);
+  TensorMap<Tensor<Type, 2, DataLayout> > in_gpu(gpu_in_ptr, num_rows, num_cols);
+  TensorMap<Tensor<Type, 0, DataLayout> > out_gpu(gpu_out_ptr);
 
   out_gpu.device(gpu_device) = in_gpu.sum();
 
-  Tensor<float, 0, DataLayout> full_redux_gpu;
+  Tensor<Type, 0, DataLayout> full_redux_gpu;
   gpu_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_ptr, out_bytes);
   gpu_device.synchronize();
 
@@ -53,7 +56,102 @@ static void test_full_reductions() {
   gpu_device.deallocate(gpu_out_ptr);
 }
 
+template<typename Type, int DataLayout>
+static void test_first_dim_reductions() {  // Sums a 3-D tensor over dimension 0 on a GpuDevice and compares with the plain Tensor result.
+  int dim_x = 33;
+  int dim_y = 1;
+  int dim_z = 128;
+
+  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
+  in.setRandom();
+
+  Eigen::array<int, 1> red_axis;
+  red_axis[0] = 0;  // reduce along the first dimension (dim_x)
+  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);  // reference result, evaluated without a GPU device
+
+  // Create device
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice dev(&stream);
+  
+  // Create device buffers and TensorMaps over them
+  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
+  Type* out_data = (Type*)dev.allocate(dim_z*dim_y*sizeof(Type));
+  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
+  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_y, dim_z);
+  
+  // Perform operation
+  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
+  gpu_out.device(dev) = gpu_in.sum(red_axis);
+  gpu_out.device(dev) += gpu_in.sum(red_axis);  // accumulate the same reduction a second time
+  Tensor<Type, 2, DataLayout> redux_gpu(dim_y, dim_z);
+  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
+  dev.synchronize();
+
+  // The GPU output was assigned and then accumulated once more, so it must equal twice the reference reduction.
+  for (int i = 0; i < gpu_out.size(); ++i) {
+    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
+  }
+
+  dev.deallocate(in_data);
+  dev.deallocate(out_data);
+}
+
+template<typename Type, int DataLayout>
+static void test_last_dim_reductions() {  // Sums a 3-D tensor over dimension 2 on a GpuDevice and compares with the plain Tensor result.
+  int dim_x = 128;
+  int dim_y = 1;
+  int dim_z = 33;
+
+  Tensor<Type, 3, DataLayout> in(dim_x, dim_y, dim_z);
+  in.setRandom();
+
+  Eigen::array<int, 1> red_axis;
+  red_axis[0] = 2;  // reduce along the last dimension (dim_z)
+  Tensor<Type, 2, DataLayout> redux = in.sum(red_axis);  // reference result, evaluated without a GPU device
+
+  // Create device
+  Eigen::CudaStreamDevice stream;
+  Eigen::GpuDevice dev(&stream);
+  
+  // Create device buffers and TensorMaps over them
+  Type* in_data = (Type*)dev.allocate(dim_x*dim_y*dim_z*sizeof(Type));
+  Type* out_data = (Type*)dev.allocate(dim_x*dim_y*sizeof(Type));
+  Eigen::TensorMap<Eigen::Tensor<Type, 3, DataLayout> > gpu_in(in_data, dim_x, dim_y, dim_z);
+  Eigen::TensorMap<Eigen::Tensor<Type, 2, DataLayout> > gpu_out(out_data, dim_x, dim_y);
+  
+  // Perform operation
+  dev.memcpyHostToDevice(in_data, in.data(), in.size()*sizeof(Type));
+  gpu_out.device(dev) = gpu_in.sum(red_axis);
+  gpu_out.device(dev) += gpu_in.sum(red_axis);  // accumulate the same reduction a second time
+  Tensor<Type, 2, DataLayout> redux_gpu(dim_x, dim_y);
+  dev.memcpyDeviceToHost(redux_gpu.data(), out_data, gpu_out.size()*sizeof(Type));
+  dev.synchronize();
+
+  // The GPU output was assigned and then accumulated once more, so it must equal twice the reference reduction.
+  for (int i = 0; i < gpu_out.size(); ++i) {
+    VERIFY_IS_APPROX(2*redux(i), redux_gpu(i));
+  }
+
+  dev.deallocate(in_data);
+  dev.deallocate(out_data);
+}
+
+
 void test_cxx11_tensor_reduction_cuda() {
-  CALL_SUBTEST_1(test_full_reductions<ColMajor>());
-  CALL_SUBTEST_2(test_full_reductions<RowMajor>());
+  CALL_SUBTEST_1((test_full_reductions<float, ColMajor>()));  // extra parentheses keep the template comma inside a single macro argument
+  CALL_SUBTEST_1((test_full_reductions<double, ColMajor>()));
+  CALL_SUBTEST_2((test_full_reductions<float, RowMajor>()));
+  CALL_SUBTEST_2((test_full_reductions<double, RowMajor>()));
+  
+  CALL_SUBTEST_3((test_first_dim_reductions<float, ColMajor>()));
+  CALL_SUBTEST_3((test_first_dim_reductions<double, ColMajor>()));
+  CALL_SUBTEST_4((test_first_dim_reductions<float, RowMajor>()));
+// Outer reductions of doubles aren't supported just yet.
+//  CALL_SUBTEST_4((test_first_dim_reductions<double, RowMajor>()));
+
+  CALL_SUBTEST_5((test_last_dim_reductions<float, ColMajor>()));
+// Outer reductions of doubles aren't supported just yet.
+//  CALL_SUBTEST_5((test_last_dim_reductions<double, ColMajor>()));
+  CALL_SUBTEST_6((test_last_dim_reductions<float, RowMajor>()));
+  CALL_SUBTEST_6((test_last_dim_reductions<double, RowMajor>()));
 }
diff --git a/unsupported/test/cxx11_tensor_scan_cuda.cu b/unsupported/test/cxx11_tensor_scan_cuda.cu
index 35e19e51c..761d11fd1 100644
--- a/unsupported/test/cxx11_tensor_scan_cuda.cu
+++ b/unsupported/test/cxx11_tensor_scan_cuda.cu
@@ -13,7 +13,9 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 #define EIGEN_USE_GPU
 
-
+#if defined __CUDACC_VER__ && __CUDACC_VER__ >= 70500
+#include <cuda_fp16.h>
+#endif
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
diff --git a/unsupported/test/kronecker_product.cpp b/unsupported/test/kronecker_product.cpp
index 02411a262..e770049e5 100644
--- a/unsupported/test/kronecker_product.cpp
+++ b/unsupported/test/kronecker_product.cpp
@@ -9,12 +9,12 @@
 // Public License v. 2.0. If a copy of the MPL was not distributed
 // with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
 
+#ifdef EIGEN_TEST_PART_1
 
 #include "sparse.h"
 #include <Eigen/SparseExtra>
 #include <Eigen/KroneckerProduct>
 
-
 template<typename MatrixType>
 void check_dimension(const MatrixType& ab, const int rows,  const int cols)
 {
@@ -230,3 +230,23 @@ void test_kronecker_product()
     VERIFY_IS_APPROX(MatrixXf(sC2),dC);
   }
 }
+
+#endif
+
+#ifdef EIGEN_TEST_PART_2
+
+// simply check that for a dense kronecker product, sparse module is not needed
+
+#include "main.h"
+#include <Eigen/KroneckerProduct>
+
+void test_kronecker_product()  // Dense-only Kronecker product check (EIGEN_TEST_PART_2).
+{
+  MatrixXd a(2,2), b(3,3), c;
+  a.setRandom();
+  b.setRandom();
+  c = kroneckerProduct(a,b);
+  VERIFY_IS_APPROX(c.block(3,3,3,3), a(1,1)*b);  // the 3x3 block starting at (3,3) must equal a(1,1)*b
+}
+
+#endif
author	Luke Iwanski <luke@codeplay.com>	2016-09-19 14:03:54 +0100
committer	Luke Iwanski <luke@codeplay.com>	2016-09-19 14:03:54 +0100
commit	b91e0211727b9ea5d7c30908ed86afc4e50d4c6c (patch)
tree	4dd857d1aabdd4065e8b7dec1369a4bd06bac5e7 /unsupported
parent	cb81975714a96ecb2faf33ca242feeee3543b1db (diff)
parent	ff47717f25aeede4878f65b214cdce264b8314e8 (diff)