author    Deven Desai <deven.desai.amd@gmail.com>  2020-08-19 20:06:39 +0000
committer Deven Desai <deven.desai.amd@gmail.com>  2020-08-20 00:29:57 +0000
commit    603e213d13311af286c8c1abd4ea14a8bd3d204e (patch)
tree      fce713b0de190f4ee9d5be162a7efb83d0f8754c /unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
parent    c060114a259af3460dc40b388df47c86944f2600 (diff)
Fixing a CUDA / P100 regression introduced by PR 181
PR 181 ( https://gitlab.com/libeigen/eigen/-/merge_requests/181 ) added the `__launch_bounds__(1024)` attribute to GPU kernels that did not have that attribute explicitly specified. That change appears to cause regressions on the CUDA platform. This commit restricts the changes from PR 181 so they apply to HIP only, by replacing the hard-coded attribute with the `EIGEN_HIP_LAUNCH_BOUNDS_1024` macro.
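For context, a minimal sketch of how a macro like `EIGEN_HIP_LAUNCH_BOUNDS_1024` could be defined so that the launch-bounds attribute is applied only when compiling for HIP. The guard name `EIGEN_HIPCC` and the location of the real definition are assumptions, not taken from this diff:

// Sketch only: the actual macro is defined elsewhere in Eigen; the guard
// shown here (EIGEN_HIPCC) is an assumption about how HIP compilation is
// detected.
#if defined(EIGEN_HIPCC)
// On HIP, cap the kernel at 1024 threads per block, as PR 181 intended.
#define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
#else
// On CUDA (and host-only builds), expand to nothing so the compiler keeps
// its default launch bounds, restoring the pre-PR-181 behavior there.
#define EIGEN_HIP_LAUNCH_BOUNDS_1024
#endif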
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h  14
1 file changed, 7 insertions(+), 7 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 9b0eb3e2f..0a65591e6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -15,7 +15,7 @@
// so we'll use a macro to make clang happy.
#ifndef KERNEL_FRIEND
#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
-#define KERNEL_FRIEND friend __global__ __launch_bounds__(1024)
+#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
#else
#define KERNEL_FRIEND friend
#endif
@@ -427,24 +427,24 @@ struct GenericReducer {
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
template <int B, int N, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
template <int NPT, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
#endif
template <int NPT, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
template <int NPT, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif
/**
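Assuming `EIGEN_HIP_LAUNCH_BOUNDS_1024` expands as sketched above, one of the forward declarations touched by this commit (e.g. `FullReductionKernel`) would effectively preprocess as follows. This is an illustration of the intended effect, not a line taken from Eigen:

// Effective declaration when compiling with HIP (attribute kept, as in PR 181):
template <int B, int N, typename S, typename R, typename I_>
__global__ __launch_bounds__(1024) void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);

// Effective declaration when compiling with CUDA (attribute dropped, pre-PR-181 behavior):
template <int B, int N, typename S, typename R, typename I_>
__global__ void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);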