diff options
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h | 18 |
1 files changed, 9 insertions, 9 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h index 36df03d62..cfc49166a 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h @@ -121,7 +121,7 @@ __device__ inline void atomicReduce(float* output, float accum, SumReducer<float template <typename CoeffType, typename Index> -__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) { +__global__ __launch_bounds__(1024) void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) { const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; const Index num_threads = blockDim.x * gridDim.x; for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) { @@ -132,7 +132,7 @@ __global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coe template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index> -__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, +__global__ __launch_bounds__(1024) void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs, typename Self::CoeffReturnType* output, unsigned int* semaphore) { #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) // Initialize the output value @@ -214,7 +214,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num #ifdef EIGEN_HAS_GPU_FP16 template <typename Self, typename Reducer, typename Index> -__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, +__global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, packet_traits<Eigen::half>::type* scratch) { eigen_assert(blockDim.x == 1); eigen_assert(gridDim.x == 1); @@ -239,7 +239,7 @@ __global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Sel template <typename Self, typename Reducer, typename Index> -__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) { +__global__ __launch_bounds__(1024) void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) { const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; const Index num_threads = blockDim.x * gridDim.x; typedef typename packet_traits<Eigen::half>::type PacketType; @@ -259,7 +259,7 @@ __global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index> -__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, +__global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output, packet_traits<Eigen::half>::type* scratch) { typedef typename packet_traits<Eigen::half>::type PacketType; const int packet_width = unpacket_traits<PacketType>::size; @@ -358,7 +358,7 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, } template <typename Op> -__global__ void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) { +__global__ __launch_bounds__(1024) void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) { eigen_assert(threadIdx.x == 1); half2* pscratch = reinterpret_cast<half2*>(scratch); half tmp = __float2half(0.f); @@ -476,7 +476,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> { template <int NumPerThread, typename Self, typename Reducer, typename Index> -__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, +__global__ __launch_bounds__(1024) void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, typename Self::CoeffReturnType* output) { #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300) typedef typename Self::CoeffReturnType Type; @@ -561,7 +561,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu template <int NumPerThread, typename Self, typename Reducer, typename Index> -__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, +__global__ __launch_bounds__(1024) void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, half* output) { eigen_assert(blockDim.y == 1); eigen_assert(blockDim.z == 1); @@ -868,7 +868,7 @@ struct InnerReducer<Self, Op, GpuDevice> { template <int NumPerThread, typename Self, typename Reducer, typename Index> -__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, +__global__ __launch_bounds__(1024) void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs, typename Self::CoeffReturnType* output) { const Index num_threads = blockDim.x * gridDim.x; const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x; |