From 46f8a18567731925e06a7389a6c611e1dc420ea8 Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Wed, 5 Aug 2020 01:46:34 +0000
Subject: Adding an explicit launch_bounds(1024) attribute for GPU kernels.

Starting with ROCm 3.5, the HIP compiler will change from HCC to hip-clang.
This compiler change introduces a change in the default value of the
`__launch_bounds__` attribute associated with a GPU kernel. (The default
value is the value the compiler assumes for the `__launch_bounds__`
attribute when the user does not specify it explicitly.)

Currently (i.e. for HIP with ROCm 3.3 and older), the default value is
1024. That changes to 256 with ROCm 3.5 (i.e. the hip-clang compiler).

As a consequence of this change, a GPU kernel whose `__launch_bounds__`
attribute is 256 fails with a runtime error if it is launched with a
threads_per_block value > 256. This leads to a couple of Eigen unit test
failures with ROCm 3.5.

This commit adds an explicit `__launch_bounds__(1024)` attribute to every
GPU kernel that currently does not specify one explicitly (and hence would
end up with the default value of 256 after the change to hip-clang).
---
 unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h |  6 +++---
 unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h   | 14 +++++++-------
 .../Eigen/CXX11/src/Tensor/TensorReductionGpu.h        | 18 +++++++++---------
 unsupported/Eigen/CXX11/src/Tensor/TensorScan.h        |  2 +-
 4 files changed, 20 insertions(+), 20 deletions(-)

(limited to 'unsupported/Eigen/CXX11')

diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
index 27ad9f147..19a834d0e 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorConvolution.h
@@ -578,7 +578,7 @@ struct GetKernelSize {
 template <typename InputEvaluator, typename Index, typename InputDims,
           int StaticKernelSize>
-__global__ void EigenConvolutionKernel1D(
+__global__ __launch_bounds__(1024) void EigenConvolutionKernel1D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 1, InputEvaluator::Layout>
         indexMapper,
@@ -630,7 +630,7 @@ __global__ void EigenConvolutionKernel1D(
 template <typename InputEvaluator, typename Index, typename InputDims,
           int StaticKernelSizeX, int StaticKernelSizeY>
-__global__ void EigenConvolutionKernel2D(
+__global__ __launch_bounds__(1024) void EigenConvolutionKernel2D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 2, InputEvaluator::Layout>
         indexMapper,
@@ -701,7 +701,7 @@ __global__ void EigenConvolutionKernel2D(
 };
 
 template <typename InputEvaluator, typename Index, typename InputDims>
-__global__ void EigenConvolutionKernel3D(
+__global__ __launch_bounds__(1024) void EigenConvolutionKernel3D(
     InputEvaluator eval,
     const internal::IndexMapper<Index, InputDims, 3, InputEvaluator::Layout>
         indexMapper,
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index af9b58816..9b0eb3e2f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -15,7 +15,7 @@
 // so we'll use a macro to make clang happy.
 #ifndef KERNEL_FRIEND
 #if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
-#define KERNEL_FRIEND friend __global__
+#define KERNEL_FRIEND friend __global__ __launch_bounds__(1024)
 #else
 #define KERNEL_FRIEND friend
 #endif
@@ -427,24 +427,24 @@ struct GenericReducer {
 #if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
 template <int B, int N, typename S, typename R, typename I_>
-__global__ void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
+__global__ __launch_bounds__(1024) void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
 
 #if defined(EIGEN_HAS_GPU_FP16)
 template <typename S, typename R, typename I_>
-__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<Eigen::half>::type*);
+__global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<Eigen::half>::type*);
 template <int B, int N, typename S, typename R, typename I_>
-__global__ void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<Eigen::half>::type*);
+__global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<Eigen::half>::type*);
 template <int NPT, typename S, typename R, typename I_>
-__global__ void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
+__global__ __launch_bounds__(1024) void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
 #endif
 
 template <int NPT, typename S, typename R, typename I_>
-__global__ void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+__global__ __launch_bounds__(1024) void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 
 template <int NPT, typename S, typename R, typename I_>
-__global__ void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+__global__ __launch_bounds__(1024) void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
 #endif
 
 /**
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index 36df03d62..cfc49166a 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -121,7 +121,7 @@ __device__ inline void atomicReduce(float* output, float accum, SumReducer<float>&) {
 template <typename CoeffType, typename Index>
-__global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+__global__ __launch_bounds__(1024) void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   const Index num_threads = blockDim.x * gridDim.x;
   for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
@@ -132,7 +132,7 @@ __global__ void ReductionInitKernel(const CoeffType val, Index num_preserved_coe
 template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
-__global__ void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+__global__ __launch_bounds__(1024) void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                     typename Self::CoeffReturnType* output, unsigned int* semaphore) {
 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   // Initialize the output value
@@ -214,7 +214,7 @@ __global__ void FullReductionKernel(Reducer reducer, const Self input, Index num
 #ifdef EIGEN_HAS_GPU_FP16
 template <typename Self, typename Reducer, typename Index>
-__global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+__global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                                                       packet_traits<Eigen::half>::type* scratch) {
   eigen_assert(blockDim.x == 1);
   eigen_assert(gridDim.x == 1);
@@ -239,7 +239,7 @@ __global__ void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Sel
 template <typename Self, typename Reducer, typename Index>
-__global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+__global__ __launch_bounds__(1024) void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   const Index num_threads = blockDim.x * gridDim.x;
   typedef typename packet_traits<Eigen::half>::type PacketType;
@@ -259,7 +259,7 @@ __global__ void ReductionInitKernelHalfFloat(Reducer reducer, const Self input,
 template <int BlockSize, int NumPerThread, typename Self, typename Reducer, typename Index>
-__global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+__global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                                              half* output, packet_traits<Eigen::half>::type* scratch) {
   typedef typename packet_traits<Eigen::half>::type PacketType;
   const int packet_width = unpacket_traits<PacketType>::size;
@@ -358,7 +358,7 @@ __global__ void FullReductionKernelHalfFloat(Reducer reducer, const Self input,
 }
 
 template <typename Op>
-__global__ void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
+__global__ __launch_bounds__(1024) void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
   eigen_assert(threadIdx.x == 1);
   half2* pscratch = reinterpret_cast<half2*>(scratch);
   half tmp = __float2half(0.f);
@@ -476,7 +476,7 @@ struct FullReducer {
 template <int NumPerThread, typename Self, typename Reducer, typename Index>
-__global__ void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ __launch_bounds__(1024) void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                      typename Self::CoeffReturnType* output) {
 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   typedef typename Self::CoeffReturnType Type;
@@ -561,7 +561,7 @@ __global__ void InnerReductionKernel(Reducer reducer, const Self input, Index nu
 template <int NumPerThread, typename Self, typename Reducer, typename Index>
-__global__ void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ __launch_bounds__(1024) void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                               half* output) {
   eigen_assert(blockDim.y == 1);
   eigen_assert(blockDim.z == 1);
@@ -868,7 +868,7 @@ struct InnerReducer {
 template <int NumPerThread, typename Self, typename Reducer, typename Index>
-__global__ void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ __launch_bounds__(1024) void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                      typename Self::CoeffReturnType* output) {
   const Index num_threads = blockDim.x * gridDim.x;
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index bef8d261f..9e3b1a0b9 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -334,7 +334,7 @@ struct ScanLauncher {
 // parallel, but it would be better to use a parallel scan algorithm and
 // optimize memory access.
 template <typename Self, typename Reducer, typename Index>
-__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
+__global__ __launch_bounds__(1024) void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) {
   // Compute offset as in the CPU version
   Index val = threadIdx.x + blockIdx.x * blockDim.x;
   Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride();
--
cgit v1.2.3
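
For background (this note and the sketch below are not part of the patch): the failure mode described in the commit message can be demonstrated in isolation. The kernel name fill_kernel and the host code are illustrative assumptions rather than Eigen code; only standard HIP runtime calls (hipMalloc, hipLaunchKernelGGL, hipGetLastError, hipGetErrorString, hipDeviceSynchronize, hipFree) are used.

#include <hip/hip_runtime.h>
#include <cstdio>

// hip-clang (ROCm >= 3.5) compiles a kernel that carries no explicit
// __launch_bounds__ as if __launch_bounds__(256) had been written, so
// launching it with more than 256 threads per block fails at runtime.
// The explicit cap below keeps a 1024-thread launch legal, which is
// what the patch above does for the Eigen kernels.
__global__ __launch_bounds__(1024) void fill_kernel(float* out, int n, float v) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = v;
}

int main() {
  const int n = 1 << 20;
  float* d_out = nullptr;
  hipMalloc(&d_out, n * sizeof(float));

  // threads_per_block == 1024 is valid only because of the explicit
  // __launch_bounds__(1024) on the kernel; with hip-clang's implicit
  // limit of 256 this launch would be rejected at runtime.
  const int threads = 1024;
  const int blocks = (n + threads - 1) / threads;
  hipLaunchKernelGGL(fill_kernel, dim3(blocks), dim3(threads), 0, 0,
                     d_out, n, 1.0f);

  printf("launch status: %s\n", hipGetErrorString(hipGetLastError()));
  hipDeviceSynchronize();
  hipFree(d_out);
  return 0;
}

Removing the __launch_bounds__(1024) qualifier and rebuilding with hipcc under ROCm 3.5 or newer should make the 1024-thread launch fail, which is the error the Eigen unit tests ran into.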