3 files changed, 94 insertions, 6 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index b33ab962e..88d485f38 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -15,7 +15,7 @@
 namespace Eigen {
 
-#ifndef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
 namespace internal {
 
 template<typename LhsScalar, typename LhsMapper, typename Index>
@@ -54,7 +54,7 @@ struct packRhsAndKernelArg {
 };
 
 }  // end namespace internal
-#endif  // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#endif  // EIGEN_USE_SIMPLE_THREAD_POOL
 
 template<typename Indices, typename LeftArgType, typename RightArgType>
 struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
@@ -112,7 +112,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   TensorEvaluator(const XprType& op, const Device& device) :
       Base(op, device) {}
 
-#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
             bool rhs_inner_dim_reordered, int Alignment>
   void evalProduct(Scalar* buffer) const {
@@ -731,7 +731,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     return 0;
   }
 
-#else  // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#else  // EIGEN_USE_SIMPLE_THREAD_POOL
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
             bool rhs_inner_dim_reordered, int Alignment>
   void evalProduct(Scalar* buffer) const {
@@ -1007,7 +1007,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       }
     }
   }
-#endif  // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#endif  // EIGEN_USE_SIMPLE_THREAD_POOL
 
   TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
                                bool shard_by_col, bool prepacked) const {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 4df4cc220..5846a5e1b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -14,7 +14,7 @@ namespace Eigen {
 
 // Use the SimpleThreadPool by default. We'll switch to the new non blocking
 // thread pool later.
-#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
 template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
 typedef NonBlockingThreadPool ThreadPool;
 #else
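For readers of this patch: the macro flip above makes the NonBlockingThreadPool the default, with EIGEN_USE_SIMPLE_THREAD_POOL now an opt-in back to the old SimpleThreadPool. Below is a minimal sketch (not part of the patch) of user code that exercises whichever pool gets selected; the pool size of 4 and the 64x64 shapes are arbitrary illustration choices:

#define EIGEN_USE_THREADS  // must be defined before including the Tensor header
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // With this patch, Eigen::ThreadPool is NonBlockingThreadPool by default;
  // compiling with -DEIGEN_USE_SIMPLE_THREAD_POOL selects SimpleThreadPool instead.
  Eigen::ThreadPool pool(4);                 // 4 worker threads (arbitrary)
  Eigen::ThreadPoolDevice device(&pool, 4);

  Eigen::Tensor<float, 2> a(64, 64), b(64, 64), c(64, 64);  // sizes are illustrative
  a.setRandom();
  b.setRandom();

  // A matrix product expressed as a tensor contraction over dimension pair (1, 0),
  // evaluated on the device's pool via TensorContractionThreadPool.h.
  Eigen::array<Eigen::IndexPair<int>, 1> dims;
  dims[0] = Eigen::IndexPair<int>(1, 0);
  c.device(device) = a.contract(b, dims);
  return 0;
}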
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 8c2baec14..63646dfc2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -360,6 +360,94 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
   }
 }
 
+#ifdef EIGEN_HAS_CUDA_FP16
+/*
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                              half* output, half2* scratch) {
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  const int unroll_times = 16;
+  eigen_assert(NumPerThread % unroll_times == 0);
+  eigen_assert(unroll_times % 2 == 0);
+
+  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    Index i = thread_id;
+    for (; i < num_preserved_coeffs; i += 2*num_threads) {
+      ((half2*)output)[i] = reducer.initializePacket();
+    }
+    if (i + 1 < num_preserved_coeffs) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = i / input_col_blocks;
+
+    if (row + 1 < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
+
+      half2 reduced_val1 = reducer.initializePacket();
+      half2 reduced_val2 = reducer.initializePacket();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
+        if (last_col >= num_coeffs_to_reduce) {
+          Index col = col_begin + blockDim.x * j;
+          for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
+            const half2 val = input.m_impl.packet(row * num_coeffs_to_reduce + col);
+            reducer.reducePacket(val, &reduced_val1);
+            // do the same for reduce val2 here
+          }
+          if (col < num_coeffs_to_reduce) {
+            // Peel; the single remaining scalar lives at index col.
+            const half last = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+            const half2 val = __halves2half2(last, reducer.initialize());
+            reducer.reducePacket(val, &reduced_val1);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k);
+            reducer.reducePacket(input.m_impl.packet(row * num_coeffs_to_reduce + col), &reduced_val1);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize/2; offset > 0; offset /= 2) {
+        reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
+      }
+
+      if ((threadIdx.x & (warpSize - 1)) == 0) {
+        if (row + 1 < num_preserved_coeffs) {
+          atomicReduce(&(output[row]), reduced_val1, reducer);
+        }
+        else {
+          atomicReduce(scratch, reduced_val1, reducer);
+        }
+      }
+    }
+  }
+}
+*/
+#endif
+
 template <typename Self, typename Op>
 struct InnerReducer<Self, Op, GpuDevice> {
   // Unfortunately nvidia doesn't support well exotic types such as complex,
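The disabled fp16 kernel above ends each row with a warp-shuffle tree reduction (the __shfl_down loop) so that only lane 0 of each warp touches global memory with an atomic. A self-contained sketch of that pattern on plain floats, separate from the patch: the kernel name and launch shape are made up for illustration, and __shfl_down is the pre-CUDA 9 intrinsic this file uses (CUDA 9+ spells it __shfl_down_sync).

#include <cstdio>
#include <cuda_runtime.h>

// Each warp sums its lanes' values with a shuffle-based tree reduction, the
// same pattern the commented-out kernel applies to half2 packets: after
// log2(warpSize) steps lane 0 holds the warp total and publishes it once.
__global__ void warpSumKernel(const float* in, float* out, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  float val = (i < n) ? in[i] : 0.0f;

#pragma unroll
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    val += __shfl_down(val, offset, warpSize);  // pre-CUDA 9 intrinsic
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicAdd(out, val);  // one atomic per warp instead of one per thread
  }
}

int main() {
  const int n = 1 << 20;
  float *in, *out;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&out, sizeof(float));
  for (int i = 0; i < n; ++i) in[i] = 1.0f;
  *out = 0.0f;

  warpSumKernel<<<(n + 255) / 256, 256>>>(in, out, n);
  cudaDeviceSynchronize();
  printf("sum = %f (expected %d)\n", *out, n);

  cudaFree(in);
  cudaFree(out);
  return 0;
}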