aboutsummaryrefslogtreecommitdiffhomepage
path: root/Eigen/src/Core/util/Macros.h
diff options
context:
space:
mode:
authorGravatar Deven Desai <deven.desai.amd@gmail.com>2020-08-19 20:06:39 +0000
committerGravatar Deven Desai <deven.desai.amd@gmail.com>2020-08-20 00:29:57 +0000
commit603e213d13311af286c8c1abd4ea14a8bd3d204e (patch)
treefce713b0de190f4ee9d5be162a7efb83d0f8754c /Eigen/src/Core/util/Macros.h
parentc060114a259af3460dc40b388df47c86944f2600 (diff)
Fixing a CUDA / P100 regression introduced by PR 181
PR 181 ( https://gitlab.com/libeigen/eigen/-/merge_requests/181 ) adds the `__launch_bounds__(1024)` attribute to GPU kernels that did not have that attribute explicitly specified. That PR seems to cause regressions on the CUDA platform. This PR/commit restricts the changes made in PR 181 so that they apply to HIP only.
Diffstat (limited to 'Eigen/src/Core/util/Macros.h')
-rw-r--r--Eigen/src/Core/util/Macros.h19
1 files changed, 19 insertions, 0 deletions
diff --git a/Eigen/src/Core/util/Macros.h b/Eigen/src/Core/util/Macros.h
index d0499a1c9..9472a7c90 100644
--- a/Eigen/src/Core/util/Macros.h
+++ b/Eigen/src/Core/util/Macros.h
@@ -440,8 +440,27 @@
// analogous to EIGEN_CUDA_ARCH, but for HIP
#define EIGEN_HIP_DEVICE_COMPILE __HIP_DEVICE_COMPILE__
#endif
+
+ // For HIP (ROCm 3.5 and higher), we need to explicitly set the launch_bounds attribute
+ // value to 1024. The compiler assigns a default value of 256 when the attribute is not
+ // specified. This results in failures on the HIP platform, for cases when a GPU kernel
+ // without an explicit launch_bounds attribute is called with a threads_per_block value
+ // greater than 256.
+ //
+ // This is a regression in functionality and is expected to be fixed within the next
+ // couple of ROCm releases (compiler will go back to using 1024 value as the default)
+ //
+ // In the meantime, we will use an "only enabled for HIP" macro to set the launch_bounds
+ // attribute.
+
+ #define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
+
#endif
+#if !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
+#define EIGEN_HIP_LAUNCH_BOUNDS_1024
+#endif // !defined(EIGEN_HIP_LAUNCH_BOUNDS_1024)
+
// Unify CUDA/HIPCC
#if defined(EIGEN_CUDACC) || defined(EIGEN_HIPCC)