-rw-r--r--  Eigen/src/Core/util/Macros.h                             | 19
-rw-r--r--  test/gpu_common.h                                        |  2
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h   |  6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h     | 14
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h  | 18
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorScan.h          |  2
6 files changed, 40 insertions(+), 21 deletions(-)
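The core mechanism of the patch is a launch-bounds annotation that is non-empty only under HIP. The following standalone sketch illustrates the same pattern outside of Eigen; the macro and kernel names (MY_LAUNCH_BOUNDS_1024, fill) are hypothetical, and the guard here is keyed off the standard __HIPCC__ macro, whereas the patch defines EIGEN_HIP_LAUNCH_BOUNDS_1024 inside Eigen's own compiler-detection block in Macros.h.

// Sketch of the conditionally-empty launch-bounds pattern (hypothetical names).
#if defined(__HIPCC__)
// HIP (ROCm 3.5+): the compiler assumes 256 threads per block when no
// launch_bounds attribute is present, so restore the 1024 limit explicitly.
#define MY_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
#else
// CUDA and host-only builds: expands to nothing; the usual default of
// 1024 threads per block already applies.
#define MY_LAUNCH_BOUNDS_1024
#endif

// Any kernel that may be launched with more than 256 threads per block
// carries the annotation; it is a no-op everywhere except HIP.
__global__ MY_LAUNCH_BOUNDS_1024 void fill(float* out, float v, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = v;
}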
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index d0499a1c9..9472a7c90 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -440,8 +440,27 @@
   // analogous to EIGEN_CUDA_ARCH, but for HIP
   #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
 #endif
+
+  // For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute
+  // value to 1024. The compiler assigns a default value of 256 when the attribute is not
+  // specified. This results in failures on the HIP platform, for cases when a GPU kernel
+  // without an explicit launch_bounds attribute is called with a threads_per_block value
+  // greater than 256.
+  //
+  // This is a regression in functionality and is expected to be fixed within the next
+  // couple of ROCm releases (the compiler will go back to using 1024 as the default value).
+  //
+  // In the meantime, we will use an "only enabled for HIP" macro to set the launch_bounds
+  // attribute.
+
+  #define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
+
 #endif
 
+#if !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
+#define EIGEN_HIP_LAUNCH_BOUNDS_1024
+#endif // !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
+
 // Unify CUDA/HIPCC
 
 #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)
diff --git a/test/gpu_common.h b/test/gpu_common.h
index 509be5942..049e7aade 100644
--- a/test/gpu_common.h
+++ b/test/gpu_common.h
@@ -29,7 +29,7 @@ void run_on_cpu(const Kernel& ker, int n, const Input& in, Output& out)
 
 template<typename Kernel, typename Input, typename Output>
 __global__
-__launch_bounds__(1024)
+EIGEN_HIP_LAUNCH_BOUNDS_1024
 void run_on_gpu_meta_kernel(const Kernel ker, int n, const Input* in, Output* out)
 {
   int i = threadIdx.x + blockIdx.x*blockDim.x;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 19a834d0e..df289e2c0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -578,7 +578,7 @@ struct GetKernelSize<Dynamic> {
 
 template <typename InputEvaluator, typename Index, typename InputDims,
           int StaticKernelSize>
-__global__ __launch_bounds__(1024) void EigenConvolutionKernel1D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
         indexMapper,
@@ -630,7 +630,7 @@ __global__ __launch_bounds__(1024) void EigenConvolutionKernel1D(
 
 template <typename InputEvaluator, typename Index, typename InputDims,
           int StaticKernelSizeX, int StaticKernelSizeY>
-__global__ __launch_bounds__(1024) void EigenConvolutionKernel2D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
         indexMapper,
@@ -701,7 +701,7 @@ __global__ __launch_bounds__(1024) void EigenConvolutionKernel2D(
 };
 
 template <typename InputEvaluator, typename Index, typename InputDims>
-__global__ __launch_bounds__(1024) void EigenConvolutionKernel3D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
         indexMapper,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 9b0eb3e2f..0a65591e6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -15,7 +15,7 @@
 // so we'll use a macro to make clang happy.
 #ifndef KERNEL_FRIEND
 #if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
-#define KERNEL_FRIEND friend __global__ __launch_bounds__(1024)
+#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
 #else
 #define KERNEL_FRIEND friend
 #endif
@@ -427,24 +427,24 @@ struct GenericReducer {
 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
 template <int B, int N, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
 
 #if defined(EIGEN_HAS_GPU_FP16)
 template <typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
 template <int B, int N, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
 template <int NPT, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
 #endif
 
 template <int NPT, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 
 template <int NPT, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 #endif
 
 /**
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index cfc49166a..02a514c0f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -121,7 +121,7 @@ __device__ inline void atomicReduce(float* output, float accum, SumReducer<float
 
 template <typename CoeffType, typename Index>
-__global__ __launch_bounds__(1024) void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   const Index num_threads = blockDim.x * gridDim.x;
   for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
@@ -132,7 +132,7 @@ __global__ __launch_bounds__(1024) void ReductionInitKernel(const CoeffType val,
 
 template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                                             typename Self::CoeffReturnType* output, unsigned int* semaphore) {
 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   // Initialize the output value
@@ -214,7 +214,7 @@ __global__ __launch_bounds__(1024) void FullReductionKernel(Reducer reducer, con
 
 #ifdef EIGEN_HAS_GPU_FP16
 template <typename Self, typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                                                                               packet_traits<Eigen::half>::type* scratch) {
   eigen_assert(blockDim.x == 1);
   eigen_assert(gridDim.x == 1);
@@ -239,7 +239,7 @@ __global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(Re
 
 template <typename Self, typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   const Index num_threads = blockDim.x * gridDim.x;
   typedef typename packet_traits<Eigen::half>::type PacketType;
@@ -259,7 +259,7 @@ __global__ __launch_bounds__(1024) void ReductionInitKernelHalfFloat(Reducer red
 
 template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                                                                      half* output, packet_traits<Eigen::half>::type* scratch) {
   typedef typename packet_traits<Eigen::half>::type PacketType;
   const int packet_width = unpacket_traits<PacketType>::size;
@@ -358,7 +358,7 @@ __global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(Reducer red
 }
 
 template <typename Op>
-__global__ __launch_bounds__(1024) void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
   eigen_assert(threadIdx.x == 1);
   half2* pscratch = reinterpret_cast<half2*>(scratch);
   half tmp = __float2half(0.f);
@@ -476,7 +476,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
 
 template <int NumPerThread, typename Self, typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                                              typename Self::CoeffReturnType* output) {
 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   typedef typename Self::CoeffReturnType Type;
@@ -561,7 +561,7 @@ __global__ __launch_bounds__(1024) void InnerReductionKernel(Reducer reducer, co
 
 template <int NumPerThread, typename Self, typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                                                       half* output) {
   eigen_assert(blockDim.y == 1);
   eigen_assert(blockDim.z == 1);
@@ -868,7 +868,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
 
 template <int NumPerThread, typename Self, typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                                              typename Self::CoeffReturnType* output) {
   const Index num_threads = blockDim.x * gridDim.x;
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 9e3b1a0b9..98c8250f0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -334,7 +334,7 @@ struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
 // parallel, but it would be better to use a parallel scan algorithm and
 // optimize memory access.
 template <typename Self, typename Reducer>
-__global__ __launch_bounds__(1024) void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
   // Compute offset as in the CPU version
   Index val = threadIdx.x + blockIdx.x * blockDim.x;
   Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
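For context, here is a hypothetical call site showing the failure mode the patch works around; fill is the sketch kernel from above and is not Eigen code. Without the annotation (or an explicit __launch_bounds__(1024)), a ROCm 3.5+ compiler builds the kernel for at most 256 threads per block, and the 1024-thread launch below fails at run time.

// Hypothetical call site: launching with threads_per_block > 256 is what
// breaks on ROCm 3.5+ when the kernel lacks a launch_bounds attribute.
#include <cuda_runtime.h>

int main() {
  int n = 1 << 20;
  float* d_out = nullptr;
  cudaMalloc(&d_out, n * sizeof(float));

  int threads_per_block = 1024;  // exceeds HIP's new implicit limit of 256
  int blocks = (n + threads_per_block - 1) / threads_per_block;
  fill<<<blocks, threads_per_block>>>(d_out, 1.0f, n);

  cudaDeviceSynchronize();
  cudaFree(d_out);
  return 0;
}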