Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor')
8 files changed, 159 insertions, 111 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt b/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
deleted file mode 100644
index 6d4b3ea0d..000000000
--- a/unsupported/Eigen/CXX11/src/Tensor/CMakeLists.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-FILE(GLOB Eigen_CXX11_Tensor_SRCS "*.h")
-
-INSTALL(FILES
-  ${Eigen_CXX11_Tensor_SRCS}
-  DESTINATION ${INCLUDE_INSTALL_DIR}/unsupported/Eigen/CXX11/src/Tensor COMPONENT Devel
-  )
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
index e3880d2e0..3c8710255 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorBase.h
@@ -192,6 +192,12 @@ class TensorBase<Derived, ReadOnlyAccessors>
     }
 
     EIGEN_DEVICE_FUNC
+    EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_log1p_op<Scalar>, const Derived>
+    log1p() const {
+      return unaryExpr(internal::scalar_log1p_op<Scalar>());
+    }
+
+    EIGEN_DEVICE_FUNC
     EIGEN_STRONG_INLINE const TensorCwiseUnaryOp<internal::scalar_abs_op<Scalar>, const Derived>
     abs() const {
       return unaryExpr(internal::scalar_abs_op<Scalar>());
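The new log1p() accessor follows the same lazy coefficient-wise pattern as abs() above, so it composes with any other tensor expression. A minimal host-side sketch (tensor shape and values are illustrative only):

  #include <unsupported/Eigen/CXX11/Tensor>

  Eigen::Tensor<float, 2> t(2, 3);
  t.setConstant(0.5f);
  // log1p(x) = log(1 + x), evaluated coefficient-wise and lazily
  Eigen::Tensor<float, 2> u = t.log1p();  // each coefficient becomes std::log1p(0.5f)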
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
index 56d9c2025..20b29e5fd 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContraction.h
@@ -25,8 +25,9 @@ template<typename Dimensions, typename LhsXprType, typename RhsXprType>
 struct traits<TensorContractionOp<Dimensions, LhsXprType, RhsXprType> >
 {
   // Type promotion to handle the case where the types of the lhs and the rhs are different.
-  typedef typename internal::promote_storage_type<typename LhsXprType::Scalar,
-                                                  typename RhsXprType::Scalar>::ret Scalar;
+  typedef typename gebp_traits<typename remove_const<typename LhsXprType::Scalar>::type,
+                               typename remove_const<typename RhsXprType::Scalar>::type>::ResScalar Scalar;
+
   typedef typename promote_storage_type<typename traits<LhsXprType>::StorageKind,
                                         typename traits<RhsXprType>::StorageKind>::ret StorageKind;
   typedef typename promote_index_type<typename traits<LhsXprType>::Index,
@@ -75,8 +76,8 @@ class TensorContractionOp : public TensorBase<TensorContractionOp<Indices, LhsXp
 {
   public:
   typedef typename Eigen::internal::traits<TensorContractionOp>::Scalar Scalar;
-  typedef typename internal::promote_storage_type<typename LhsXprType::CoeffReturnType,
-                                                  typename RhsXprType::CoeffReturnType>::ret CoeffReturnType;
+  typedef typename internal::gebp_traits<typename LhsXprType::CoeffReturnType,
+                                         typename RhsXprType::CoeffReturnType>::ResScalar CoeffReturnType;
   typedef typename Eigen::internal::nested<TensorContractionOp>::type Nested;
   typedef typename Eigen::internal::traits<TensorContractionOp>::StorageKind StorageKind;
   typedef typename Eigen::internal::traits<TensorContractionOp>::Index Index;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
index a76c8ca35..d66e45d50 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorCostModel.h
@@ -91,21 +91,21 @@ class TensorOpCost {
   }
 
   // TODO(rmlarsen): Define min in terms of total cost, not elementwise.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMin(
-      const TensorOpCost& rhs) {
-    bytes_loaded_ = numext::mini(bytes_loaded_, rhs.bytes_loaded());
-    bytes_stored_ = numext::mini(bytes_stored_, rhs.bytes_stored());
-    compute_cycles_ = numext::mini(compute_cycles_, rhs.compute_cycles());
-    return *this;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMin(
+      const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::mini(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::mini(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::mini(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
   }
 
   // TODO(rmlarsen): Define max in terms of total cost, not elementwise.
-  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& cwiseMax(
-      const TensorOpCost& rhs) {
-    bytes_loaded_ = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
-    bytes_stored_ = numext::maxi(bytes_stored_, rhs.bytes_stored());
-    compute_cycles_ = numext::maxi(compute_cycles_, rhs.compute_cycles());
-    return *this;
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost cwiseMax(
+      const TensorOpCost& rhs) const {
+    double bytes_loaded = numext::maxi(bytes_loaded_, rhs.bytes_loaded());
+    double bytes_stored = numext::maxi(bytes_stored_, rhs.bytes_stored());
+    double compute_cycles = numext::maxi(compute_cycles_, rhs.compute_cycles());
+    return TensorOpCost(bytes_loaded, bytes_stored, compute_cycles);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost& operator+=(
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
index b2b4bcf62..834ce07df 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorEvaluator.h
@@ -239,7 +239,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC
   TensorEvaluator(const XprType& op, const Device& device)
-      : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device)
+      : m_functor(op.functor()), m_argImpl(op.nestedExpression(), device), m_wrapper()
   { }
 
   typedef typename XprType::Index Index;
@@ -256,13 +256,13 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
 
   EIGEN_DEVICE_FUNC CoeffReturnType coeff(Index index) const
   {
-    return m_functor(index);
+    return m_wrapper(m_functor, index);
   }
 
   template<int LoadMode>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE PacketReturnType packet(Index index) const
   {
-    return m_functor.template packetOp<Index, PacketReturnType>(index);
+    return m_wrapper.template packetOp<PacketReturnType, Index>(m_functor, index);
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE TensorOpCost
@@ -282,6 +282,7 @@ struct TensorEvaluator<const TensorCwiseNullaryOp<NullaryOp, ArgType>, Device>
  private:
   const NullaryOp m_functor;
   TensorEvaluator<ArgType, Device> m_argImpl;
+  const internal::nullary_wrapper<CoeffReturnType,NullaryOp> m_wrapper;
 };
 
 
@@ -612,7 +613,7 @@ struct TensorEvaluator<const TensorSelectOp<IfArgType, ThenArgType, ElseArgType>
         .cwiseMax(m_elseImpl.costPerCoeff(vectorized));
   }
 
-  EIGEN_DEVICE_FUNC CoeffReturnType* data() const { return NULL; }
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE CoeffReturnType* data() const { return NULL; }
 
   /// required by sycl in order to extract the accessor
   const TensorEvaluator<IfArgType, Device> & cond_impl() const { return m_condImpl; }
   /// required by sycl in order to extract the accessor
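Making cwiseMin()/cwiseMax() const and value-returning is what lets the TensorSelectOp evaluator above combine the costs of its three sub-evaluators without mutating any of them. A small illustrative sketch of the new semantics (constructor order: bytes_loaded, bytes_stored, compute_cycles):

  Eigen::TensorOpCost a(/*bytes_loaded=*/16, /*bytes_stored=*/8, /*compute_cycles=*/10);
  Eigen::TensorOpCost b(/*bytes_loaded=*/32, /*bytes_stored=*/4, /*compute_cycles=*/2);
  Eigen::TensorOpCost lo = a.cwiseMin(b);  // (16, 4, 2); a and b are left untouched
  Eigen::TensorOpCost hi = a.cwiseMax(b);  // (32, 8, 10)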
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index a8e48fced..fc75dbb5c 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -25,7 +25,7 @@ struct scalar_mod_op {
 };
 template <typename Scalar>
 struct functor_traits<scalar_mod_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
 
 
 /** \internal
@@ -38,7 +38,7 @@ struct scalar_mod2_op {
 };
 template <typename Scalar>
 struct functor_traits<scalar_mod2_op<Scalar> >
-{ enum { Cost = NumTraits<Scalar>::template Div<false>::Cost, PacketAccess = false }; };
+{ enum { Cost = scalar_div_cost<Scalar,false>::value, PacketAccess = false }; };
 
 template <typename Scalar>
 struct scalar_fmod_op {
@@ -188,6 +188,32 @@ struct reducer_traits<MeanReducer<T>, Device> {
 };
 
 
+template <typename T, bool IsMax = true, bool IsInteger = true>
+struct MinMaxBottomValue {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+    return Eigen::NumTraits<T>::lowest();
+  }
+};
+template <typename T>
+struct MinMaxBottomValue<T, true, false> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+    return -Eigen::NumTraits<T>::infinity();
+  }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, true> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+    return Eigen::NumTraits<T>::highest();
+  }
+};
+template <typename T>
+struct MinMaxBottomValue<T, false, false> {
+  EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE static T bottom_value() {
+    return Eigen::NumTraits<T>::infinity();
+  }
+};
+
+
 template <typename T> struct MaxReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasMax;
@@ -200,9 +226,8 @@ template <typename T> struct MaxReducer
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
     (*accum) = pmax<Packet>(*accum, p);
   }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return Eigen::NumTraits<T>::lowest();
+    return MinMaxBottomValue<T, true, Eigen::NumTraits<T>::IsInteger>::bottom_value();
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
@@ -242,9 +267,8 @@ template <typename T> struct MinReducer
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reducePacket(const Packet& p, Packet* accum) const {
     (*accum) = pmin<Packet>(*accum, p);
   }
-
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE T initialize() const {
-    return Eigen::NumTraits<T>::highest();
+    return MinMaxBottomValue<T, false, Eigen::NumTraits<T>::IsInteger>::bottom_value();
   }
   template <typename Packet>
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE Packet initializePacket() const {
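MinMaxBottomValue selects the identity element for min/max reductions: plus or minus infinity for floating-point types, and the finite extremes for integer types, which have no infinity. What the four specializations yield, sketched as host-side checks (assuming the trait is reachable as Eigen::internal::MinMaxBottomValue):

  #include <limits>
  #include <cassert>
  using Eigen::internal::MinMaxBottomValue;

  // Max reductions over floats start from -inf, min reductions from +inf.
  assert((MinMaxBottomValue<float, true,  false>::bottom_value()
          == -std::numeric_limits<float>::infinity()));
  assert((MinMaxBottomValue<float, false, false>::bottom_value()
          ==  std::numeric_limits<float>::infinity()));
  // Integers fall back to lowest()/highest().
  assert((MinMaxBottomValue<int, true,  true>::bottom_value()
          == std::numeric_limits<int>::min()));
  assert((MinMaxBottomValue<int, false, true>::bottom_value()
          == std::numeric_limits<int>::max()));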
@@ -454,12 +478,11 @@ template <typename T> class UniformRandomGenerator {
     m_deterministic = other.m_deterministic;
   }
 
-  template<typename Index>
-  T operator()(Index) const {
+  T operator()() const {
     return random<T>();
   }
-  template<typename Index, typename PacketType>
-  PacketType packetOp(Index) const {
+  template<typename PacketType>
+  PacketType packetOp() const {
     const int packetSize = internal::unpacket_traits<PacketType>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
     for (int i = 0; i < packetSize; ++i) {
@@ -484,23 +507,22 @@ template <> class UniformRandomGenerator<float> {
   }
   UniformRandomGenerator(const UniformRandomGenerator<float>& other) {
     m_generator = new std::mt19937();
-    m_generator->seed(other(0) * UINT_MAX);
+    m_generator->seed(other() * UINT_MAX);
     m_deterministic = other.m_deterministic;
   }
   ~UniformRandomGenerator() {
     delete m_generator;
   }
 
-  template<typename Index>
-  float operator()(Index) const {
+  float operator()() const {
     return m_distribution(*m_generator);
   }
-  template<typename Index, typename PacketType>
-  PacketType packetOp(Index i) const {
+  template<typename PacketType>
+  PacketType packetOp() const {
     const int packetSize = internal::unpacket_traits<PacketType>::size;
     EIGEN_ALIGN_MAX float values[packetSize];
     for (int k = 0; k < packetSize; ++k) {
-      values[k] = this->operator()(i);
+      values[k] = this->operator()();
     }
     return internal::pload<PacketType>(values);
   }
@@ -525,23 +547,22 @@ template <> class UniformRandomGenerator<double> {
   }
   UniformRandomGenerator(const UniformRandomGenerator<double>& other) {
     m_generator = new std::mt19937();
-    m_generator->seed(other(0) * UINT_MAX);
+    m_generator->seed(other() * UINT_MAX);
     m_deterministic = other.m_deterministic;
   }
   ~UniformRandomGenerator() {
     delete m_generator;
   }
 
-  template<typename Index>
-  double operator()(Index) const {
+  double operator()() const {
     return m_distribution(*m_generator);
   }
-  template<typename Index, typename PacketType>
-  PacketType packetOp(Index i) const {
+  template<typename PacketType>
+  PacketType packetOp() const {
     const int packetSize = internal::unpacket_traits<PacketType>::size;
     EIGEN_ALIGN_MAX double values[packetSize];
     for (int k = 0; k < packetSize; ++k) {
-      values[k] = this->operator()(i);
+      values[k] = this->operator()();
     }
     return internal::pload<PacketType>(values);
   }
@@ -578,12 +599,11 @@ template <> class UniformRandomGenerator<float> {
     curand_init(seed, tid, 0, &m_state);
   }
 
-  template<typename Index>
-  __device__ float operator()(Index) const {
+  __device__ float operator()() const {
     return curand_uniform(&m_state);
   }
-  template<typename Index, typename PacketType>
-  __device__ float4 packetOp(Index) const {
+  template<typename PacketType>
+  __device__ float4 packetOp() const {
     EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
     return curand_uniform4(&m_state);
   }
@@ -608,12 +628,11 @@ template <> class UniformRandomGenerator<double> {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ double operator()(Index) const {
+  __device__ double operator()() const {
     return curand_uniform_double(&m_state);
  }
-  template<typename Index, typename PacketType>
-  __device__ double2 packetOp(Index) const {
+  template<typename PacketType>
+  __device__ double2 packetOp() const {
     EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
     return curand_uniform2_double(&m_state);
   }
@@ -638,8 +657,7 @@ template <> class UniformRandomGenerator<std::complex<float> > {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ std::complex<float> operator()(Index) const {
+  __device__ std::complex<float> operator()() const {
     float4 vals = curand_uniform4(&m_state);
     return std::complex<float>(vals.x, vals.y);
   }
@@ -664,8 +682,7 @@ template <> class UniformRandomGenerator<std::complex<double> > {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ std::complex<double> operator()(Index) const {
+  __device__ std::complex<double> operator()() const {
     double2 vals = curand_uniform2_double(&m_state);
     return std::complex<double>(vals.x, vals.y);
   }
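The generators thus lose their unused Index parameter: a draw no longer pretends to depend on the coefficient position, and the nullary_wrapper introduced in TensorEvaluator.h is what now mediates between the indexed evaluator interface and these index-free functors. New call shape on the host side (the boolean constructor flag is an assumption about the surrounding API, not something this patch adds):

  Eigen::internal::UniformRandomGenerator<float> gen(/*deterministic=*/true);
  float a = gen();   // no Index argument anymore
  float b = gen();   // each call simply advances the underlying engine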
@@ -701,17 +718,16 @@ template <typename T> class NormalRandomGenerator {
   }
   NormalRandomGenerator(const NormalRandomGenerator& other)
      : m_deterministic(other.m_deterministic), m_distribution(other.m_distribution), m_generator(new std::mt19937()) {
-    m_generator->seed(other(0) * UINT_MAX);
+    m_generator->seed(other() * UINT_MAX);
   }
   ~NormalRandomGenerator() {
     delete m_generator;
   }
-  template<typename Index>
-  T operator()(Index) const {
+  T operator()() const {
     return m_distribution(*m_generator);
   }
-  template<typename Index, typename PacketType>
-  PacketType packetOp(Index) const {
+  template<typename PacketType>
+  PacketType packetOp() const {
     const int packetSize = internal::unpacket_traits<PacketType>::size;
     EIGEN_ALIGN_MAX T values[packetSize];
     for (int i = 0; i < packetSize; ++i) {
@@ -749,12 +765,11 @@ template <> class NormalRandomGenerator<float> {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ float operator()(Index) const {
+  __device__ float operator()() const {
     return curand_normal(&m_state);
   }
-  template<typename Index, typename PacketType>
-  __device__ float4 packetOp(Index) const {
+  template<typename PacketType>
+  __device__ float4 packetOp() const {
     EIGEN_STATIC_ASSERT((is_same<PacketType, float4>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
     return curand_normal4(&m_state);
   }
@@ -779,12 +794,11 @@ template <> class NormalRandomGenerator<double> {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ double operator()(Index) const {
+  __device__ double operator()() const {
     return curand_normal_double(&m_state);
   }
-  template<typename Index, typename PacketType>
-  __device__ double2 packetOp(Index) const {
+  template<typename PacketType>
+  __device__ double2 packetOp() const {
     EIGEN_STATIC_ASSERT((is_same<PacketType, double2>::value), YOU_MADE_A_PROGRAMMING_MISTAKE);
     return curand_normal2_double(&m_state);
   }
@@ -809,8 +823,7 @@ template <> class NormalRandomGenerator<std::complex<float> > {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ std::complex<float> operator()(Index) const {
+  __device__ std::complex<float> operator()() const {
     float4 vals = curand_normal4(&m_state);
     return std::complex<float>(vals.x, vals.y);
   }
@@ -835,8 +848,7 @@ template <> class NormalRandomGenerator<std::complex<double> > {
     const int seed = m_deterministic ? 0 : get_random_seed();
     curand_init(seed, tid, 0, &m_state);
   }
-  template<typename Index>
-  __device__ std::complex<double> operator()(Index) const {
+  __device__ std::complex<double> operator()() const {
     double2 vals = curand_normal2_double(&m_state);
     return std::complex<double>(vals.x, vals.y);
   }
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 9df697e4c..a87777b22 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -505,9 +505,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
         (reducing_inner_dims || ReducingInnerMostDims)) {
       const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
      const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
-      if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
-        data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
-        m_result = data;
+      if (!data) {
+        if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 128) {
+          data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+          m_result = data;
+        }
+        else {
+          return true;
+        }
       }
       Op reducer(m_reducer);
       if (internal::InnerReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
@@ -533,9 +538,14 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
         preserving_inner_dims) {
       const Index num_values_to_reduce = internal::array_prod(m_reducedDims);
       const Index num_coeffs_to_preserve = internal::array_prod(m_dimensions);
-      if (!data && num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
-        data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
-        m_result = data;
+      if (!data) {
+        if (num_coeffs_to_preserve < 1024 && num_values_to_reduce > num_coeffs_to_preserve && num_values_to_reduce > 32) {
+          data = static_cast<CoeffReturnType*>(m_device.allocate(sizeof(CoeffReturnType) * num_coeffs_to_preserve));
+          m_result = data;
+        }
+        else {
+          return true;
+        }
       }
       Op reducer(m_reducer);
       if (internal::OuterReducer<Self, Op, Device>::run(*this, reducer, m_device, data, num_values_to_reduce, num_coeffs_to_preserve)) {
@@ -556,6 +566,7 @@ struct TensorEvaluator<const TensorReductionOp<Op, Dims, ArgType>, Device>
     m_impl.cleanup();
     if (m_result) {
       m_device.deallocate(m_result);
+      m_result = NULL;
     }
   }
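Nulling m_result after deallocation makes a repeated cleanup() harmless instead of a double free. The fix is the standard guard idiom, shown standalone below (illustrative sketch, not Eigen code):

  struct Holder {
    float* buf = nullptr;
    void cleanup() {
      if (buf) {
        delete[] buf;
        buf = nullptr;  // a second cleanup() now takes the false branch
      }
    }
  };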
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 5e512490c..65638b6a8 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -67,11 +67,21 @@ __device__ EIGEN_ALWAYS_INLINE void atomicReduce(T* output, T accum, R& reducer)
 #endif
 }
 
+// We extend atomicExch to support extra data types
+template <typename Type>
+__device__ inline Type atomicExchCustom(Type* address, Type val) {
+  return atomicExch(address, val);
+}
+
+template <>
+__device__ inline double atomicExchCustom(double* address, double val) {
+  unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(address);
+  return __longlong_as_double(atomicExch(address_as_ull, __double_as_longlong(val)));
+}
 
 #ifdef EIGEN_HAS_CUDA_FP16
 template <template <typename T> class R>
 __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer) {
-#if __CUDA_ARCH__ >= 300
   unsigned int oldval = *reinterpret_cast<unsigned int*>(output);
   unsigned int newval = oldval;
   reducer.reducePacket(accum, reinterpret_cast<half2*>(&newval));
@@ -87,9 +97,6 @@ __device__ inline void atomicReduce(half2* output, half2 accum, R<half>& reducer
       return;
     }
   }
-#else
-  assert(0 && "Shouldn't be called on unsupported device");
-#endif
 }
 #endif
 
@@ -130,7 +137,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
     unsigned int block = atomicCAS(semaphore, 0u, 1u);
     if (block == 0) {
       // We're the first block to run, initialize the output value
-      atomicExch(output, reducer.initialize());
+      atomicExchCustom(output, reducer.initialize());
       __threadfence();
       atomicExch(semaphore, 2u);
     }
@@ -263,17 +270,22 @@ __global__ void ReductionCleanupKernelHalfFloat(Op& reducer, half* output, half2
 #endif
 
-
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct FullReductionLauncher {
   static void run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index) {
-    assert(false && "Should only be called on floats and half floats");
+    assert(false && "Should only be called on doubles, floats and half floats");
   }
 };
 
-template <typename Self, typename Op, bool PacketAccess>
-struct FullReductionLauncher<Self, Op, float, PacketAccess> {
-  static void run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct FullReductionLauncher<
+    Self, Op, OutputType, PacketAccess,
+    typename internal::enable_if<
+      internal::is_same<float, OutputType>::value ||
+      internal::is_same<double, OutputType>::value,
+    void>::type> {
+  static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs) {
     typedef typename Self::Index Index;
     typedef typename Self::CoeffReturnType Scalar;
     const int block_size = 256;
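atomicExchCustom exists because CUDA provides no double overload of atomicExch, so the specialization exchanges the value through its 64-bit integer bit pattern. The same reinterpretation trick is the canonical way to build other missing double atomics, e.g. the well-known CAS-loop atomicAdd from the CUDA programming guide (illustrative device code, not part of this patch):

  __device__ double atomicAddDouble(double* address, double val) {
    unsigned long long int* address_as_ull =
        reinterpret_cast<unsigned long long int*>(address);
    unsigned long long int old = *address_as_ull, assumed;
    do {
      assumed = old;
      // Add on the double view, compare-and-swap on the 64-bit integer view.
      old = atomicCAS(address_as_ull, assumed,
                      __double_as_longlong(val + __longlong_as_double(assumed)));
    } while (assumed != old);  // retry if another thread updated the slot
    return __longlong_as_double(old);
  }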
@@ -330,20 +342,22 @@ struct FullReductionLauncher<Self, Op, Eigen::half, true> {
 
 template <typename Self, typename Op, bool Vectorizable>
 struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
   // Unfortunately nvidia doesn't support well exotic types such as complex,
-  // so reduce the scope of the optimized version of the code to the simple case
-  // of floats and half floats.
+  // so reduce the scope of the optimized version of the code to the simple cases
+  // of doubles, floats and half floats
 #ifdef EIGEN_HAS_CUDA_FP16
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
        (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
 #else
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      internal::is_same<typename Self::CoeffReturnType, float>::value;
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value);
 #endif
 
   template <typename OutputType>
   static void run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output) {
-    assert(HasOptimizedImplementation && "Should only be called on floats or half floats");
+    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
     const Index num_coeffs = array_prod(self.m_impl.dimensions());
     // Don't crash when we're called with an input tensor of size 0.
     if (num_coeffs == 0) {
@@ -360,6 +374,7 @@ template <int NumPerThread, typename Self,
 __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                      typename Self::CoeffReturnType* output) {
 #if __CUDA_ARCH__ >= 300
+  typedef typename Self::CoeffReturnType Type;
   eigen_assert(blockDim.y == 1);
   eigen_assert(blockDim.z == 1);
   eigen_assert(gridDim.y == 1);
@@ -389,13 +404,13 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
     const Index col_block = i % input_col_blocks;
     const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
 
-    float reduced_val = reducer.initialize();
+    Type reduced_val = reducer.initialize();
 
     for (Index j = 0; j < NumPerThread; j += unroll_times) {
       const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
       if (last_col >= num_coeffs_to_reduce) {
         for (Index col = col_begin + blockDim.x * j; col < num_coeffs_to_reduce; col += blockDim.x) {
-          const float val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+          const Type val = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
           reducer.reduce(val, &reduced_val);
         }
         break;
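Switching the launchers from one explicit specialization per scalar type to a trailing SFINAE parameter means a single specialization now covers both float and double. The pattern in isolation (standalone sketch using std::enable_if where Eigen uses its internal equivalent; the names are illustrative):

  #include <type_traits>

  template <typename T, typename Enabled = void>
  struct Launcher {
    static bool run() { return false; }  // fallback for unsupported types
  };

  template <typename T>
  struct Launcher<T, typename std::enable_if<
      std::is_same<T, float>::value || std::is_same<T, double>::value>::type> {
    static bool run() { return true; }   // one body serves float and double
  };

  // Launcher<double>::run() -> true, Launcher<int>::run() -> false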
@@ -521,17 +536,23 @@ __global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input,
 #endif
 
-template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+template <typename Self, typename Op, typename OutputType, bool PacketAccess, typename Enabled = void>
 struct InnerReductionLauncher {
   static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const GpuDevice&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce floats and half floats on a gpu device");
+    assert(false && "Should only be called to reduce doubles, floats and half floats on a gpu device");
     return true;
   }
 };
 
-template <typename Self, typename Op, bool PacketAccess>
-struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
-  static bool run(const Self& self, Op& reducer, const GpuDevice& device, float* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
+// Specialization for float and double
+template <typename Self, typename Op, typename OutputType, bool PacketAccess>
+struct InnerReductionLauncher<
+    Self, Op, OutputType, PacketAccess,
+    typename internal::enable_if<
+      internal::is_same<float, OutputType>::value ||
+      internal::is_same<double, OutputType>::value,
+    void>::type> {
+  static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
     typedef typename Self::Index Index;
 
     const Index num_coeffs = num_coeffs_to_reduce * num_preserved_vals;
@@ -549,7 +570,7 @@ struct InnerReductionLauncher<Self, Op, float, PacketAccess> {
       const int max_blocks = device.getNumCudaMultiProcessors() *
                             device.maxCudaThreadsPerMultiProcessor() / 1024;
       const int num_blocks = numext::mini<int>(max_blocks, dyn_blocks);
-      LAUNCH_CUDA_KERNEL((ReductionInitKernel<float, Index>),
+      LAUNCH_CUDA_KERNEL((ReductionInitKernel<OutputType, Index>),
                          num_blocks, 1024, 0, device, reducer.initialize(),
                          num_preserved_vals, output);
     }
@@ -616,15 +637,17 @@ struct InnerReducer<Self, Op, GpuDevice> {
 #ifdef EIGEN_HAS_CUDA_FP16
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value ||
        (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
 #else
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      internal::is_same<typename Self::CoeffReturnType, float>::value;
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value);
 #endif
 
   template <typename OutputType>
   static bool run(const Self& self, Op& reducer, const GpuDevice& device, OutputType* output, typename Self::Index num_coeffs_to_reduce, typename Self::Index num_preserved_vals) {
-    assert(HasOptimizedImplementation && "Should only be called on floats or half floats");
+    assert(HasOptimizedImplementation && "Should only be called on doubles, floats or half floats");
     const Index num_coeffs = array_prod(self.m_impl.dimensions());
     // Don't crash when we're called with an input tensor of size 0.
     if (num_coeffs == 0) {
@@ -675,11 +698,11 @@ struct OuterReducer<Self, Op, GpuDevice> {
   // so reduce the scope of the optimized version of the code to the simple case
   // of floats.
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
-      internal::is_same<typename Self::CoeffReturnType, float>::value;
-
+      (internal::is_same<typename Self::CoeffReturnType, float>::value ||
+       internal::is_same<typename Self::CoeffReturnType, double>::value);
   template <typename Device, typename OutputType>
   static EIGEN_DEVICE_FUNC bool run(const Self&, Op&, const Device&, OutputType*, typename Self::Index, typename Self::Index) {
-    assert(false && "Should only be called to reduce floats on a gpu device");
+    assert(false && "Should only be called to reduce doubles or floats on a gpu device");
     return true;
   }
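Taken together, full, inner and outer GPU reductions now have an optimized path for double as well as float. A usage sketch (requires building with nvcc and EIGEN_USE_GPU; d_in and d_out are assumed preallocated device buffers):

  #define EIGEN_USE_GPU
  #include <unsupported/Eigen/CXX11/Tensor>

  Eigen::CudaStreamDevice stream;
  Eigen::GpuDevice gpu_device(&stream);
  Eigen::TensorMap<Eigen::Tensor<double, 2> > in(d_in, 128, 1024);
  Eigen::TensorMap<Eigen::Tensor<double, 1> > out(d_out, 128);
  Eigen::array<int, 1> dims;
  dims[0] = 1;                             // sum over dimension 1
  out.device(gpu_device) = in.sum(dims);   // double reductions now take the optimized kernels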