3 files changed, 94 insertions, 6 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
index b33ab962e..88d485f38 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorContractionThreadPool.h
@@ -15,7 +15,7 @@
 namespace Eigen {
 
-#ifndef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifdef EIGEN_USE_SIMPLE_THREAD_POOL
 namespace internal {
 
 template<typename LhsScalar, typename LhsMapper, typename Index>
@@ -54,7 +54,7 @@ struct packRhsAndKernelArg {
 };
 
 }  // end namespace internal
-#endif  // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#endif  // EIGEN_USE_SIMPLE_THREAD_POOL
 
 template<typename Indices, typename LeftArgType, typename RightArgType>
 struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgType>, ThreadPoolDevice> :
@@ -112,7 +112,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
   TensorEvaluator(const XprType& op, const Device& device) :
       Base(op, device) {}
 
-#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
             bool rhs_inner_dim_reordered, int Alignment>
   void evalProduct(Scalar* buffer) const {
@@ -731,7 +731,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
     return 0;
   }
 
-#else  // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#else  // EIGEN_USE_SIMPLE_THREAD_POOL
   template <bool lhs_inner_dim_contiguous, bool rhs_inner_dim_contiguous,
             bool rhs_inner_dim_reordered, int Alignment>
   void evalProduct(Scalar* buffer) const {
@@ -1007,7 +1007,7 @@ struct TensorEvaluator<const TensorContractionOp<Indices, LeftArgType, RightArgT
       }
     }
   }
-#endif  // EIGEN_USE_NONBLOCKING_THREAD_POOL
+#endif  // EIGEN_USE_SIMPLE_THREAD_POOL
 
   TensorOpCost contractionCost(Index m, Index n, Index bm, Index bn, Index bk,
                                bool shard_by_col, bool prepacked) const {
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
index 4df4cc220..5846a5e1b 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorDeviceThreadPool.h
@@ -14,7 +14,7 @@ namespace Eigen {
 
 // Use the SimpleThreadPool by default. We'll switch to the new non blocking
 // thread pool later.
-#ifdef EIGEN_USE_NONBLOCKING_THREAD_POOL
+#ifndef EIGEN_USE_SIMPLE_THREAD_POOL
 template <typename Env> using ThreadPoolTempl = NonBlockingThreadPoolTempl<Env>;
 typedef NonBlockingThreadPool ThreadPool;
 #else
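For readers of this patch: the macro flip above makes the NonBlockingThreadPool the default, with EIGEN_USE_SIMPLE_THREAD_POOL now an opt-in back to the old SimpleThreadPool. Below is a minimal sketch (not part of the patch) of user code that exercises whichever pool gets selected; the pool size of 4 and the 64x64 shapes are arbitrary illustration choices:

#define EIGEN_USE_THREADS  // must be defined before including the Tensor header
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  // With this patch, Eigen::ThreadPool is NonBlockingThreadPool by default;
  // compiling with -DEIGEN_USE_SIMPLE_THREAD_POOL selects SimpleThreadPool instead.
  Eigen::ThreadPool pool(4);                 // 4 worker threads (arbitrary)
  Eigen::ThreadPoolDevice device(&pool, 4);

  Eigen::Tensor<float, 2> a(64, 64), b(64, 64), c(64, 64);  // sizes are illustrative
  a.setRandom();
  b.setRandom();

  // A matrix product expressed as a tensor contraction over dimension pair (1, 0),
  // evaluated on the device's pool via TensorContractionThreadPool.h.
  Eigen::array<Eigen::IndexPair<int>, 1> dims;
  dims[0] = Eigen::IndexPair<int>(1, 0);
  c.device(device) = a.contract(b, dims);
  return 0;
}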
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
index 8c2baec14..63646dfc2 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
@@ -360,6 +360,94 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
   }
 }
 
+#ifdef EIGEN_HAS_CUDA_FP16
+/*
+template <int NumPerThread, typename Self,
+          typename Reducer, typename Index>
+__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+                                              half* output, half2* scratch) {
+  eigen_assert(blockDim.y == 1);
+  eigen_assert(blockDim.z == 1);
+  eigen_assert(gridDim.y == 1);
+  eigen_assert(gridDim.z == 1);
+
+  const int unroll_times = 16;
+  eigen_assert(NumPerThread % unroll_times == 0);
+  eigen_assert(unroll_times % 2 == 0);
+
+  const Index input_col_blocks = divup<Index>(num_coeffs_to_reduce, blockDim.x * NumPerThread);
+  const Index num_input_blocks = input_col_blocks * num_preserved_coeffs;
+
+  const Index num_threads = blockDim.x * gridDim.x;
+  const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // Initialize the output values if they weren't initialized by the ReductionInitKernel
+  if (gridDim.x == 1) {
+    Index i = thread_id;
+    for (; i < num_preserved_coeffs; i += 2*num_threads) {
+      ((half2*)output)[i] = reducer.initializePacket();
+    }
+    if (i + 1 < num_preserved_coeffs) {
+      output[i] = reducer.initialize();
+    }
+    __syncthreads();
+  }
+
+  for (Index i = blockIdx.x; i < num_input_blocks; i += gridDim.x) {
+    const Index row = i / input_col_blocks;
+
+    if (row + 1 < num_preserved_coeffs) {
+      const Index col_block = i % input_col_blocks;
+      const Index col_begin = col_block * blockDim.x * NumPerThread + threadIdx.x;
+
+      half2 reduced_val1 = reducer.initializePacket();
+      half2 reduced_val2 = reducer.initializePacket();
+
+      for (Index j = 0; j < NumPerThread; j += unroll_times) {
+        const Index last_col = col_begin + blockDim.x * (j + unroll_times - 1);
+        if (last_col >= num_coeffs_to_reduce) {
+          Index col = col_begin + blockDim.x * j;
+          for (; col + 1 < num_coeffs_to_reduce; col += blockDim.x) {
+            const half2 val = input.m_impl.packet(row * num_coeffs_to_reduce + col);
+            reducer.reducePacket(val, &reduced_val1);
+            // do the same for reduce val2 here
+          }
+          if (col < num_coeffs_to_reduce) {
+            // Peel; the single remaining scalar lives at index col.
+            const half last = input.m_impl.coeff(row * num_coeffs_to_reduce + col);
+            const half2 val = __halves2half2(last, reducer.initialize());
+            reducer.reducePacket(val, &reduced_val1);
+          }
+          break;
+        } else {
+          // Faster version of the loop with no branches after unrolling.
+#pragma unroll
+          for (int k = 0; k < unroll_times; ++k) {
+            const Index col = col_begin + blockDim.x * (j + k);
+            reducer.reducePacket(input.m_impl.packet(row * num_coeffs_to_reduce + col), &reduced_val1);
+          }
+        }
+      }
+
+#pragma unroll
+      for (int offset = warpSize/2; offset > 0; offset /= 2) {
+        reducer.reducePacket(__shfl_down(reduced_val1, offset, warpSize), &reduced_val1);
+      }
+
+      if ((threadIdx.x & (warpSize - 1)) == 0) {
+        if (row + 1 < num_preserved_coeffs) {
+          atomicReduce(&(output[row]), reduced_val1, reducer);
+        }
+        else {
+          atomicReduce(scratch, reduced_val1, reducer);
+        }
+      }
+    }
+  }
+}
+*/
+#endif
+
 template <typename Self, typename Op>
 struct InnerReducer<Self, Op, GpuDevice> {
   // Unfortunately nvidia doesn't support well exotic types such as complex,
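The disabled fp16 kernel above ends each row with a warp-shuffle tree reduction (the __shfl_down loop) so that only lane 0 of each warp touches global memory with an atomic. A self-contained sketch of that pattern on plain floats, separate from the patch: the kernel name and launch shape are made up for illustration, and __shfl_down is the pre-CUDA 9 intrinsic this file uses (CUDA 9+ spells it __shfl_down_sync).

#include <cstdio>
#include <cuda_runtime.h>

// Each warp sums its lanes' values with a shuffle-based tree reduction, the
// same pattern the commented-out kernel applies to half2 packets: after
// log2(warpSize) steps lane 0 holds the warp total and publishes it once.
__global__ void warpSumKernel(const float* in, float* out, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  float val = (i < n) ? in[i] : 0.0f;

#pragma unroll
  for (int offset = warpSize / 2; offset > 0; offset /= 2) {
    val += __shfl_down(val, offset, warpSize);  // pre-CUDA 9 intrinsic
  }

  if ((threadIdx.x & (warpSize - 1)) == 0) {
    atomicAdd(out, val);  // one atomic per warp instead of one per thread
  }
}

int main() {
  const int n = 1 << 20;
  float *in, *out;
  cudaMallocManaged(&in, n * sizeof(float));
  cudaMallocManaged(&out, sizeof(float));
  for (int i = 0; i < n; ++i) in[i] = 1.0f;
  *out = 0.0f;

  warpSumKernel<<<(n + 255) / 256, 256>>>(in, out, n);
  cudaDeviceSynchronize();
  printf("sum = %f (expected %d)\n", *out, n);

  cudaFree(in);
  cudaFree(out);
  return 0;
}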