From 603e213d13311af286c8c1abd4ea14a8bd3d204e Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Wed, 19 Aug 2020 20:06:39 +0000 Subject: Fixing a CUDA / P100 regression introduced by PR 181 PR 181 ( https://gitlab.com/libeigen/eigen/-/merge_requests/181 ) adds `__launch_bounds__(1024)` attribute to GPU kernels, that did not have that attribute explicitly specified. That PR seems to cause regressions on the CUDA platform. This PR/commit makes the changes in PR 181, to be applicable for HIP only --- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h index 9b0eb3e2f..0a65591e6 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h @@ -15,7 +15,7 @@ // so we'll use a macro to make clang happy. #ifndef KERNEL_FRIEND #if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__)) -#define KERNEL_FRIEND friend __global__ __launch_bounds__(1024) +#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 #else #define KERNEL_FRIEND friend #endif @@ -427,24 +427,24 @@ struct GenericReducer { #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC)) template -__global__ __launch_bounds__(1024) void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*); #if defined(EIGEN_HAS_GPU_FP16) template -__global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits::type*); +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits::type*); template -__global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits::type*); +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits::type*); template -__global__ __launch_bounds__(1024) void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*); #endif template -__global__ __launch_bounds__(1024) void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); template -__global__ __launch_bounds__(1024) void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*); #endif /** -- cgit v1.2.3