From 603e213d13311af286c8c1abd4ea14a8bd3d204e Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Wed, 19 Aug 2020 20:06:39 +0000 Subject: Fixing a CUDA / P100 regression introduced by PR 181 PR 181 ( https://gitlab.com/libeigen/eigen/-/merge_requests/181 ) adds `__launch_bounds__(1024)` attribute to GPU kernels, that did not have that attribute explicitly specified. That PR seems to cause regressions on the CUDA platform. This PR/commit makes the changes in PR 181, to be applicable for HIP only --- Eigen/src/Core/util/Macros.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'Eigen/src/Core/util/Macros.h') diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h index d0499a1c9..9472a7c90 100644 --- a/Eigen/src/Core/util/Macros.h +++ b/Eigen/src/Core/util/Macros.h @@ -440,8 +440,27 @@ // analogous to EIGEN_CUDA_ARCH, but for HIP #define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__ #endif + + // For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute + // value to 1024. The compiler assigns a default value of 256 when the attribute is not + // specified. This results in failures on the HIP platform, for cases when a GPU kernel + // without an explicit launch_bounds attribute is called with a threads_per_block value + // greater than 256. + // + // This is a regression in functioanlity and is expected to be fixed within the next + // couple of ROCm releases (compiler will go back to using 1024 value as the default) + // + // In the meantime, we will use a "only enabled for HIP" macro to set the launch_bounds + // attribute. + + #define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024) + #endif +#if !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024) +#define EIGEN_HIP_LAUNCH_BOUNDS_1024 +#endif // !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024) + // Unify CUDA/HIPCC #if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC) -- cgit v1.2.3