diff options
author | Deven Desai <deven.desai.amd@gmail.com> | 2020-08-19 20:06:39 +0000 |
---|---|---|
committer | Deven Desai <deven.desai.amd@gmail.com> | 2020-08-20 00:29:57 +0000 |
commit | 603e213d13311af286c8c1abd4ea14a8bd3d204e (patch) | |
tree | fce713b0de190f4ee9d5be162a7efb83d0f8754c /unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | |
parent | c060114a259af3460dc40b388df47c86944f2600 (diff) |
Fixing a CUDA / P100 regression introduced by PR 181
PR 181 ( https://gitlab.com/libeigen/eigen/-/merge_requests/181 ) adds `__launch_bounds__(1024)` attribute to GPU kernels, that did not have that attribute explicitly specified.
That PR seems to cause regressions on the CUDA platform. This PR/commit makes the changes in PR 181, to be applicable for HIP only
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h | 6 |
1 files changed, 3 insertions, 3 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h index 19a834d0e..df289e2c0 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h @@ -578,7 +578,7 @@ struct GetKernelSize<Dynamic> { template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSize> -__global__ __launch_bounds__(1024) void EigenConvolutionKernel1D( +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D( InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout> indexMapper, @@ -630,7 +630,7 @@ __global__ __launch_bounds__(1024) void EigenConvolutionKernel1D( template <typename InputEvaluator, typename Index, typename InputDims, int StaticKernelSizeX, int StaticKernelSizeY> -__global__ __launch_bounds__(1024) void EigenConvolutionKernel2D( +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D( InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout> indexMapper, @@ -701,7 +701,7 @@ __global__ __launch_bounds__(1024) void EigenConvolutionKernel2D( }; template <typename InputEvaluator, typename Index, typename InputDims> -__global__ __launch_bounds__(1024) void EigenConvolutionKernel3D( +__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D( InputEvaluator eval, const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout> indexMapper, |