aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
diff options
context:
space:
mode:
authorGravatar Deven Desai <deven.desai.amd@gmail.com>2020-08-19 20:06:39 +0000
committerGravatar Deven Desai <deven.desai.amd@gmail.com>2020-08-20 00:29:57 +0000
commit 603e213d13311af286c8c1abd4ea14a8bd3d204e (patch)
tree fce713b0de190f4ee9d5be162a7efb83d0f8754c /unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
parent c060114a259af3460dc40b388df47c86944f2600 (diff)
Fixing a CUDA / P100 regression introduced by PR 181
PR 181 (https://gitlab.com/libeigen/eigen/-/merge_requests/181) adds the `__launch_bounds__(1024)` attribute to GPU kernels that did not have that attribute explicitly specified. That PR seems to cause regressions on the CUDA platform. This PR/commit restricts the changes made in PR 181 so that they apply to HIP only.
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h')
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h 6
1 files changed, 3 insertions, 3 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 19a834d0e..df289e2c0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -578,7 +578,7 @@ struct GetKernelSize<Dynamic> {
template <typename InputEvaluator, typename Index, typename InputDims,
int StaticKernelSize>
-__global__ __launch_bounds__(1024) void EigenConvolutionKernel1D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
InputEvaluator eval,
const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
indexMapper,
@@ -630,7 +630,7 @@ __global__ __launch_bounds__(1024) void EigenConvolutionKernel1D(
template <typename InputEvaluator, typename Index, typename InputDims,
int StaticKernelSizeX, int StaticKernelSizeY>
-__global__ __launch_bounds__(1024) void EigenConvolutionKernel2D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
InputEvaluator eval,
const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
indexMapper,
@@ -701,7 +701,7 @@ __global__ __launch_bounds__(1024) void EigenConvolutionKernel2D(
};
template <typename InputEvaluator, typename Index, typename InputDims>
-__global__ __launch_bounds__(1024) void EigenConvolutionKernel3D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
InputEvaluator eval,
const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
indexMapper,