6 files changed, 107 insertions, 16 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index a60a17049..ee16cde9b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -202,7 +202,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     // across k dimension.
     const TensorOpCost cost =
         contractionCost(m, n, bm, bn, bk, shard_by_col, false);
-    Index num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
+    int num_threads = TensorCostModel<ThreadPoolDevice>::numThreads(
         static_cast<double>(n) * m, cost, this->m_device.numThreads());
 
     // TODO(dvyukov): this is a stop-gap to prevent regressions while the cost
@@ -301,7 +301,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   class Context {
    public:
     Context(const Device& device, int num_threads, LhsMapper& lhs,
-            RhsMapper& rhs, Scalar* buffer, Index m, Index n, Index k, Index bm,
+            RhsMapper& rhs, Scalar* buffer, Index tm, Index tn, Index tk, Index bm,
             Index bn, Index bk, Index nm, Index nn, Index nk, Index gm,
             Index gn, Index nm0, Index nn0, bool shard_by_col,
             bool parallel_pack)
@@ -309,13 +309,13 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
           lhs_(lhs),
           rhs_(rhs),
           buffer_(buffer),
-          output_(buffer, m),
+          output_(buffer, tm),
           num_threads_(num_threads),
           shard_by_col_(shard_by_col),
           parallel_pack_(parallel_pack),
-          m_(m),
-          n_(n),
-          k_(k),
+          m_(tm),
+          n_(tn),
+          k_(tk),
           bm_(bm),
           bn_(bn),
           bk_(bk),
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index d31b0ad38..c770d024f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -106,7 +106,7 @@ static EIGEN_STRONG_INLINE void wait_until_ready(SyncType* n) {
 // Build a thread pool device on top the an existing pool of threads.
 struct ThreadPoolDevice {
   // The ownership of the thread pool remains with the caller.
-  ThreadPoolDevice(ThreadPoolInterface* pool, size_t num_cores) : pool_(pool), num_threads_(num_cores) { }
+  ThreadPoolDevice(ThreadPoolInterface* pool, int num_cores) : pool_(pool), num_threads_(num_cores) { }
 
   EIGEN_STRONG_INLINE void* allocate(size_t num_bytes) const {
     return internal::aligned_malloc(num_bytes);
@@ -130,7 +130,7 @@ struct ThreadPoolDevice {
     ::memset(buffer, c, n);
   }
 
-  EIGEN_STRONG_INLINE size_t numThreads() const {
+  EIGEN_STRONG_INLINE int numThreads() const {
     return num_threads_;
   }
 
@@ -182,7 +182,7 @@ struct ThreadPoolDevice {
                    std::function<void(Index, Index)> f) const {
     typedef TensorCostModel<ThreadPoolDevice> CostModel;
     if (n <= 1 || numThreads() == 1 ||
-        CostModel::numThreads(n, cost, numThreads()) == 1) {
+        CostModel::numThreads(n, cost, static_cast<int>(numThreads())) == 1) {
       f(0, n);
       return;
     }
@@ -242,7 +242,7 @@ struct ThreadPoolDevice {
     // Recursively divide size into halves until we reach block_size.
     // Division code rounds mid to block_size, so we are guaranteed to get
     // block_count leaves that do actual computations.
-    Barrier barrier(block_count);
+    Barrier barrier(static_cast<unsigned int>(block_count));
     std::function<void(Index, Index)> handleRange;
     handleRange = [=, &handleRange, &barrier, &f](Index first, Index last) {
       if (last - first <= block_size) {
@@ -268,7 +268,7 @@ struct ThreadPoolDevice {
 
  private:
   ThreadPoolInterface* pool_;
-  size_t num_threads_;
+  int num_threads_;
 };
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
index 3dd32e9d1..bf52e490f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorFunctors.h
@@ -84,6 +84,14 @@ struct functor_traits<scalar_sigmoid_op<T> > {
 };
 
 
+template<typename Reducer, typename Device>
+struct reducer_traits {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
 // Standard reduction functors
 template <typename T> struct SumReducer
 {
@@ -119,6 +127,15 @@ template <typename T> struct SumReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<SumReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::type::HasAdd
+  };
+};
+
+
 template <typename T> struct MeanReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasAdd && !NumTraits<T>::IsInteger;
@@ -162,6 +179,15 @@ template <typename T> struct MeanReducer
     DenseIndex packetCount_;
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MeanReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::type::HasAdd
+  };
+};
+
+
 template <typename T> struct MaxReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasMax;
@@ -195,6 +221,15 @@ template <typename T> struct MaxReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MaxReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::type::HasMax
+  };
+};
+
+
 template <typename T> struct MinReducer
 {
   static const bool PacketAccess = packet_traits<T>::HasMin;
@@ -228,6 +263,14 @@ template <typename T> struct MinReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<MinReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = PacketType<T, Device>::type::HasMin
+  };
+};
+
 
 template <typename T> struct ProdReducer
 {
@@ -263,6 +306,14 @@ template <typename T> struct ProdReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ProdReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::MulCost,
+    PacketAccess = PacketType<T, Device>::type::HasMul
+  };
+};
+
 
 struct AndReducer
 {
@@ -280,6 +331,15 @@ struct AndReducer
   }
 };
 
+template <typename Device>
+struct reducer_traits<AndReducer, Device> {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
+
 struct OrReducer {
   static const bool PacketAccess = false;
   static const bool IsStateful = false;
@@ -295,6 +355,15 @@ struct OrReducer {
   }
 };
 
+template <typename Device>
+struct reducer_traits<OrReducer, Device> {
+  enum {
+    Cost = 1,
+    PacketAccess = false
+  };
+};
+
+
 // Argmin/Argmax reducers
 template <typename T> struct ArgMaxTupleReducer
 {
@@ -312,6 +381,15 @@ template <typename T> struct ArgMaxTupleReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ArgMaxTupleReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = false
+  };
+};
+
+
 template <typename T> struct ArgMinTupleReducer
 {
   static const bool PacketAccess = false;
@@ -328,6 +406,14 @@ template <typename T> struct ArgMinTupleReducer
   }
 };
 
+template <typename T, typename Device>
+struct reducer_traits<ArgMinTupleReducer<T>, Device> {
+  enum {
+    Cost = NumTraits<T>::AddCost,
+    PacketAccess = false
+  };
+};
+
 
 // Random number generation
 namespace {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
index b1645d56f..82a905a65 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorMeta.h
@@ -55,6 +55,11 @@ struct PacketType {
 // For CUDA packet types when using a GpuDevice
 #if defined(EIGEN_USE_GPU) && defined(__CUDACC__)
 template <>
+struct PacketType<half, GpuDevice> {
+  typedef half2 type;
+  static const int size = 2;
+};
+template <>
 struct PacketType<float, GpuDevice> {
   typedef float4 type;
   static const int size = 4;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index e82530955..1b4fdd03f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -331,7 +331,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
 #ifdef EIGEN_HAS_CUDA_FP16
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess));
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
 #else
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                  internal::is_same<typename Self::CoeffReturnType, float>::value;
@@ -346,7 +346,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
       return;
     }
 
-    FullReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs);
+    FullReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs);
   }
 };
 
@@ -608,7 +608,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
 #ifdef EIGEN_HAS_CUDA_FP16
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
       (internal::is_same<typename Self::CoeffReturnType, float>::value ||
-       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && Op::PacketAccess));
+       (internal::is_same<typename Self::CoeffReturnType, Eigen::half>::value && reducer_traits<Op, GpuDevice>::PacketAccess));
 #else
   static const bool HasOptimizedImplementation = !Op::IsStateful &&
                                                  internal::is_same<typename Self::CoeffReturnType, float>::value;
@@ -627,7 +627,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
       return true;
     }
 
-    return InnerReductionLauncher<Self, Op, OutputType, Op::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
+    return InnerReductionLauncher<Self, Op, OutputType, reducer_traits<Op, GpuDevice>::PacketAccess>::run(self, reducer, device, output, num_coeffs_to_reduce, num_preserved_vals);
   }
 };
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 61df8032d..0d084141d 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -122,7 +122,7 @@ struct TensorEvaluator<const TensorScanOp<Op, ArgType>, Device> {
   }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE const Dimensions& dimensions() const {
-    return m_dimensions;
+    return m_dimensions;
  }
 
   EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE bool evalSubExprsIfNeeded(Scalar* data) {
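
The core of this change is the reducer_traits class template added to TensorFunctors.h: a primary template that defaults to Cost = 1 and PacketAccess = false, plus per-reducer, per-device specializations. The CUDA reduction launchers then consult reducer_traits<Op, GpuDevice>::PacketAccess instead of the reducer's own Op::PacketAccess flag, so half-precision reductions can opt in only on devices where half2 packets exist (the new PacketType<half, GpuDevice>). The following is a minimal standalone sketch of that traits-dispatch pattern, with hypothetical simplified types; the real trait lives in Eigen's internal namespace and derives PacketAccess from PacketType<T, Device>:

// Sketch only: illustrates the primary-template-plus-specialization pattern
// this commit introduces, not the actual Eigen headers.
#include <iostream>

struct DefaultDevice {};  // hypothetical stand-in device types
struct GpuDevice {};

template <typename T> struct SumReducer {};  // hypothetical stand-in reducer

// Primary template: unknown reducers are assumed cheap and non-vectorizable.
template <typename Reducer, typename Device>
struct reducer_traits {
  enum { Cost = 1, PacketAccess = false };
};

// Specialization: a sum reduction advertises packet (vectorized) support
// on a particular device.
template <typename T>
struct reducer_traits<SumReducer<T>, GpuDevice> {
  enum { Cost = 1, PacketAccess = true };
};

int main() {
  // Launcher-style dispatch keys off the trait, not the reducer itself.
  std::cout << reducer_traits<SumReducer<float>, DefaultDevice>::PacketAccess  // prints 0
            << " "
            << reducer_traits<SumReducer<float>, GpuDevice>::PacketAccess      // prints 1
            << "\n";
  return 0;
}

The design point is that packet support becomes a property of the (reducer, device) pair rather than of the reducer alone, which is exactly what the FullReducer and InnerReducer changes in TensorReductionCuda.h rely on.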