-rw-r--r--   unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h   10
-rw-r--r--   unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h         2
-rw-r--r--   unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h           88
3 files changed, 94 insertions, 6 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index b33ab962e..88d485f38 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -15,7 +15,7 @@
namespace Eigen {
-#ifndef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
namespace internal {
template<typename LhsScalar, typename LhsMapper, typename Index>
@@ -54,7 +54,7 @@ struct packRhsAndKernelArg {
};
} // end namespace internal
-#endif // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#endif // EIGEN_USE_SIMPLE_THREAD_POOL
template<typename Indices, typename LeftArgType, typename RightArgType>
struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
@@ -112,7 +112,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
TensorEvaluator(const XprType& op, const Device& device) :
Base(op, device) {}
-#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
bool rhs_inner_dim_reordered, int Alignment>
void evalProduct(Scalar* buffer) const {
@@ -731,7 +731,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
return 0;
}
-#else // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#else // EIGEN_USE_SIMPLE_THREAD_POOL
template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous, bool rhs_inner_dim_reordered, int Alignment>
void evalProduct(Scalar* buffer) const {
@@ -1007,7 +1007,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
}
}
}
-#endif // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#endif // EIGEN_USE_SIMPLE_THREAD_POOL
TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
bool shard_by_col, bool prepacked) const {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 4df4cc220..5846a5e1b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -14,7 +14,7 @@ namespace Eigen {
// Use the SimpleThreadPool by default. We'll switch to the new non blocking
// thread pool later.
-#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
typedef NonBlockingThreadPool ThreadPool;
#else
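With this change the default flips: unless EIGEN_USE_SIMPLE_THREAD_POOL is defined before the Tensor headers are included, ThreadPool resolves to NonBlockingThreadPool, and the contraction evaluator above compiles the non-blocking evalProduct path. A minimal sketch of how a client could opt back into the old pool follows; the EIGEN_USE_THREADS setup, thread count, and tensor sizes are illustrative and not part of this patch.

// Opting back into the simple pool; omit the first define to get the
// non-blocking pool, which this patch makes the default.
#define EIGEN_USE_SIMPLE_THREAD_POOL
#define EIGEN_USE_THREADS
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::ThreadPool pool(4);                 // SimpleThreadPool under this define
  Eigen::ThreadPoolDevice device(&pool, 4);
  Eigen::Tensor<float, 2> a(64, 64), b(64, 64), c(64, 64);
  a.setRandom();
  b.setRandom();
  Eigen::array<Eigen::IndexPair<int>, 1> dims = {{Eigen::IndexPair<int>(1, 0)}};
  c.device(device) = a.contract(b, dims);    // runs the threaded contraction path
  return 0;
}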
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 8c2baec14..63646dfc2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -360,6 +360,94 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
}
}
+#ifdef EIGEN_HAS_CUDA_FP16
+/*
+template <int NumPerThread, typename Self,
+ typename Reducer, typename Index>
+__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+ half* output, half2* scratch) {
+ eigen_assert(blockDim.y == 1);
+ eigen_assert(blockDim.z == 1);
+ eigen_assert(gridDim.y == 1);
+ eigen_assert(gridDim.z == 1);
+
+ const int unroll_times = 16;
+ eigen_assert(NumPerThread % unroll_times == 0);
+ eigen_assert(unroll_times % 2 == 0);
+
+ const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+ const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+ const Index num_threads = blockDim.x * gridDim.x;
+ const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+ // Initialize the output values if they weren't initialized by the ReductionInitKernel
+ if (gridDim.x == 1) {
+ // Each thread initializes two outputs at a time via half2 stores.
+ Index i = 2 * thread_id;
+ for (; i + 1 < num_preserved_coeffs; i += 2*num_threads) {
+ ((half2*)output)[i/2] = reducer.initializePacket();
+ }
+ // Odd leftover coefficient, if any.
+ if (i < num_preserved_coeffs) {
+ output[i] = reducer.initialize();
+ }
+ __syncthreads();
+ }
+
+ for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+ const Index row = i / input_col_blocks;
+
+ if (row + 1 < num_preserved_coeffs) {
+ const Index col_block = i % input_col_blocks;
+ const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
+
+ half2 reduced_val1 = reducer.initializePacket();
+ half2 reduced_val2 = reducer.initializePacket();
+
+ for (Index j = 0; j < NumPerThread; j += unroll_times) {
+ const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
+ if (last_col >= num_coeffs_to_reduce) {
+ Index col = col_begin + blockDim.x * j;
+ for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
+ const half2 val = input.m_impl.packet(row * num_coeffs_to_reduce + col);
+ reducer.reducePacket(val, &reduced_val1);
+ // TODO: do the same for reduced_val2 here
+ }
+ if (col < num_coeffs_to_reduce) {
+ // Peel the last, odd coefficient and pad the packet with the identity.
+ const half last = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+ const half2 val = __halves2half2(last, reducer.initialize());
+ reducer.reducePacket(val, &reduced_val1);
+ }
+ break;
+ } else {
+ // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+ for (int k = 0; k < unroll_times; ++k) {
+ const Index col = col_begin + blockDim.x * (j + k);
+ reducer.reducePacket(input.m_impl.packet(row * num_coeffs_to_reduce + col), &reduced_val1);
+ }
+ }
+ }
+
+#pragma unroll
+ for (int offset = warpSize/2; offset > 0; offset /= 2) {
+ reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
+ }
+
+ if ((threadIdx.x & (warpSize - 1)) == 0) {
+ if (row + 1 < num_preserved_coeffs) {
+ atomicReduce(reinterpret_cast<half2*>(output + row), reduced_val1, reducer);
+ }
+ else {
+ atomicReduce(scratch, reduced_val1, reducer);
+ }
+ }
+ }
+ }
+}
+*/
+#endif
+
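The disabled kernel above follows a common fp16 reduction pattern: load coefficients two at a time as half2, keep per-thread partial results in half2 registers, reduce across the warp with shuffle instructions, and only then touch global memory. A stripped-down, standalone CUDA sketch of that pattern for a plain row sum is below; it does not use Eigen's reducer interface, the name rowSumHalf2 is made up, and it assumes one 32-thread block per row, an even row length, and hardware fp16 arithmetic (sm_53 or newer).

#include <cuda_fp16.h>

// Hypothetical kernel: block b sums row b of a row-major [rows x len] half matrix.
__global__ void rowSumHalf2(const half* input, float* row_sums, int len) {
  const half2* row = reinterpret_cast<const half2*>(input + blockIdx.x * len);
  half2 acc = __float2half2_rn(0.f);

  // Strided half2 loads: each thread accumulates every blockDim.x-th packet of the row.
  for (int i = threadIdx.x; i < len / 2; i += blockDim.x) {
    acc = __hadd2(acc, row[i]);
  }

  // Fold the two half lanes together, then reduce across the warp in float.
  float sum = __low2float(acc) + __high2float(acc);
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    sum += __shfl_down_sync(0xffffffff, sum, offset);
  }
  if (threadIdx.x == 0) {
    row_sums[blockIdx.x] = sum;
  }
}

A launch such as rowSumHalf2<<<num_rows, 32>>>(d_input, d_row_sums, row_len) reduces each row independently; the commented-out kernel above additionally unrolls its inner loop and uses atomicReduce so that several blocks can cooperate on one output.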
template <typename Self, typename Op>
struct InnerReducer<Self, Op, GpuDevice> {
// Unfortunately nvidia doesn't support well exotic types such as complex,