author    Deven Desai <deven.desai.amd@gmail.com>    2020-08-19 20:06:39 +0000
committer Deven Desai <deven.desai.amd@gmail.com>    2020-08-20 00:29:57 +0000
commit    603e213d13311af286c8c1abd4ea14a8bd3d204e (patch)
tree      fce713b0de190f4ee9d5be162a7efb83d0f8754c /unsupported
parent    c060114a259af3460dc40b388df47c86944f2600 (diff)
Fixing a CUDA / P100 regression introduced by PR 181
PR 181 ( https://gitlab.com/libeigen/eigen/-/merge_requests/181 ) adds the `__launch_bounds__(1024)` attribute to GPU kernels that did not have it explicitly specified. That change appears to cause performance regressions on the CUDA platform (observed on P100). This commit restricts the changes from PR 181 so that they apply to HIP only.
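The macro substituted throughout this diff is defined elsewhere in Eigen and is not part of the patch. A minimal sketch of the intended expansion, assuming Eigen's EIGEN_HIPCC guard is what distinguishes HIP device compilation (the guard name is an assumption, not taken from this diff):

// Sketch only: apply __launch_bounds__(1024) when compiling with the HIP
// compiler, and expand to nothing on CUDA or host-only builds, so the
// attribute becomes HIP-specific.
#if defined(EIGEN_HIPCC)
#define EIGEN_HIP_LAUNCH_BOUNDS_1024 __launch_bounds__(1024)
#else
#define EIGEN_HIP_LAUNCH_BOUNDS_1024
#endif

With this expansion, CUDA builds see the kernel declarations exactly as they were before PR 181, while HIP builds keep the 1024-thread launch bound.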
Diffstat (limited to 'unsupported')
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h    6
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h     14
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h  18
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorScan.h           2
4 files changed, 20 insertions(+), 20 deletions(-)
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 19a834d0e..df289e2c0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -578,7 +578,7 @@ struct GetKernelSize<Dynamic> {
template <typename InputEvaluator, typename Index, typename InputDims,
int StaticKernelSize>
-__global__ __launch_bounds__(1024) void EigenConvolutionKernel1D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel1D(
InputEvaluator eval,
const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
indexMapper,
@@ -630,7 +630,7 @@ __global__ __launch_bounds__(1024) void EigenConvolutionKernel1D(
template <typename InputEvaluator, typename Index, typename InputDims,
int StaticKernelSizeX, int StaticKernelSizeY>
-__global__ __launch_bounds__(1024) void EigenConvolutionKernel2D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel2D(
InputEvaluator eval,
const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
indexMapper,
@@ -701,7 +701,7 @@ __global__ __launch_bounds__(1024) void EigenConvolutionKernel2D(
};
template <typename InputEvaluator, typename Index, typename InputDims>
-__global__ __launch_bounds__(1024) void EigenConvolutionKernel3D(
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void EigenConvolutionKernel3D(
InputEvaluator eval,
const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
indexMapper,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index 9b0eb3e2f..0a65591e6 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -15,7 +15,7 @@
// so we'll use a macro to make clang happy.
#ifndef KERNEL_FRIEND
#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
-#define KERNEL_FRIEND friend __global__ __launch_bounds__(1024)
+#define KERNEL_FRIEND friend __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024
#else
#define KERNEL_FRIEND friend
#endif
@@ -427,24 +427,24 @@ struct GenericReducer {
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
template <int B, int N, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
template <int NPT, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
#endif
template <int NPT, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
template <int NPT, typename S, typename R, typename I_>
-__global__ __launch_bounds__(1024) void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif
/**
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index cfc49166a..02a514c0f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -121,7 +121,7 @@ __device__ inline void atomicReduce(float* output, float accum, SumReducer<float
template <typename CoeffType, typename Index>
-__global__ __launch_bounds__(1024) void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
const Index num_threads = blockDim.x * gridDim.x;
for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
@@ -132,7 +132,7 @@ __global__ __launch_bounds__(1024) void ReductionInitKernel(const CoeffType val,
template <int BlockSize, int NumPerThread, typename Self,
typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
typename Self::CoeffReturnType* output, unsigned int* semaphore) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
// Initialize the output value
@@ -214,7 +214,7 @@ __global__ __launch_bounds__(1024) void FullReductionKernel(Reducer reducer, con
#ifdef EIGEN_HAS_GPU_FP16
template <typename Self,
typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
packet_traits<Eigen::half>::type* scratch) {
eigen_assert(blockDim.x == 1);
eigen_assert(gridDim.x == 1);
@@ -239,7 +239,7 @@ __global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(Re
template <typename Self,
typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
const Index num_threads = blockDim.x * gridDim.x;
typedef typename packet_traits<Eigen::half>::type PacketType;
@@ -259,7 +259,7 @@ __global__ __launch_bounds__(1024) void ReductionInitKernelHalfFloat(Reducer red
template <int BlockSize, int NumPerThread, typename Self,
typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
half* output, packet_traits<Eigen::half>::type* scratch) {
typedef typename packet_traits<Eigen::half>::type PacketType;
const int packet_width = unpacket_traits<PacketType>::size;
@@ -358,7 +358,7 @@ __global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(Reducer red
}
template <typename Op>
-__global__ __launch_bounds__(1024) void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
eigen_assert(threadIdx.x == 1);
half2* pscratch = reinterpret_cast<half2*>(scratch);
half tmp = __float2half(0.f);
@@ -476,7 +476,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
template <int NumPerThread, typename Self,
typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
#if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
typedef typename Self::CoeffReturnType Type;
@@ -561,7 +561,7 @@ __global__ __launch_bounds__(1024) void InnerReductionKernel(Reducer reducer, co
template <int NumPerThread, typename Self,
typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
half* output) {
eigen_assert(blockDim.y == 1);
eigen_assert(blockDim.z == 1);
@@ -868,7 +868,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
template <int NumPerThread, typename Self,
typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
typename Self::CoeffReturnType* output) {
const Index num_threads = blockDim.x * gridDim.x;
const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 9e3b1a0b9..98c8250f0 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -334,7 +334,7 @@ struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> {
// parallel, but it would be better to use a parallel scan algorithm and
// optimize memory access.
template <typename Self, typename Reducer>
-__global__ __launch_bounds__(1024) void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
// Compute offset as in the CPU version
Index val = threadIdx.x + blockIdx.x * blockDim.x;
Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
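For reference, a minimal sketch of how a kernel annotated this way is declared and launched; the kernel, its arguments, and the host-side launch below are illustrative and not taken from the patch. The block size passed at launch must not exceed the bound promised to the compiler (1024 threads per block on HIP; on CUDA the macro is assumed to expand to nothing, so no bound is imposed):

// Hypothetical kernel annotated like the Eigen kernels above.
template <typename T>
__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScaleKernel(T* data, T factor, int n) {
  const int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) data[i] *= factor;
}

// Host-side launch: 1024 threads per block, which satisfies the HIP bound.
void scale(float* d_data, float factor, int n) {
  const int block = 1024;
  const int grid = (n + block - 1) / block;
  ScaleKernel<float><<<grid, block>>>(d_data, factor, n);
}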