From 603e213d13311af286c8c1abd4ea14a8bd3d204e Mon Sep 17 00:00:00 2001
From: Deven Desai <deven.desai.amd@gmail.com>
Date: Wed, 19 Aug 2020 20:06:39 +0000
Subject: Fixing a CUDA / P100 regression introduced by PR 181

PR 181 ( https://gitlab.com/libeigen/eigen/-/merge_requests/181 ) adds `__launch_bounds__(1024)` attribute to GPU kernels, that did not have that attribute explicitly specified.

That PR seems to cause regressions on the CUDA platform. This PR/commit makes the changes in PR 181, to be applicable for HIP only
---
 .../Eigen/CXX11/src/Tensor/TensorReductionGpu.h        | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

(limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h')
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
index cfc49166a..02a514c0f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h
@@ -121,7 +121,7 @@ __device__ inline void atomicReduce(float* output, float accum, SumReducer<float
 
 
 template <typename CoeffType, typename Index>
-__global__ __launch_bounds__(1024) void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernel(const CoeffType val, Index num_preserved_coeffs, CoeffType* output) {
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   const Index num_threads = blockDim.x * gridDim.x;
   for (Index i = thread_id; i < num_preserved_coeffs; i += num_threads) {
@@ -132,7 +132,7 @@ __global__ __launch_bounds__(1024) void ReductionInitKernel(const CoeffType val,
 
 template <int BlockSize, int NumPerThread, typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernel(Reducer reducer, const Self input, Index num_coeffs,
                                     typename Self::CoeffReturnType* output, unsigned int* semaphore) {
 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   // Initialize the output value
@@ -214,7 +214,7 @@ __global__ __launch_bounds__(1024) void FullReductionKernel(Reducer reducer, con
 #ifdef EIGEN_HAS_GPU_FP16
 template <typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitFullReduxKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                                                       packet_traits<Eigen::half>::type* scratch) {
   eigen_assert(blockDim.x == 1);
   eigen_assert(gridDim.x == 1);
@@ -239,7 +239,7 @@ __global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(Re
 
 template <typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionInitKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs, half* output) {
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   const Index num_threads = blockDim.x * gridDim.x;
   typedef typename packet_traits<Eigen::half>::type PacketType;
@@ -259,7 +259,7 @@ __global__ __launch_bounds__(1024) void ReductionInitKernelHalfFloat(Reducer red
 
 template <int BlockSize, int NumPerThread, typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void FullReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs,
                                     half* output, packet_traits<Eigen::half>::type* scratch) {
   typedef typename packet_traits<Eigen::half>::type PacketType;
   const int packet_width = unpacket_traits<PacketType>::size;
@@ -358,7 +358,7 @@ __global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(Reducer red
 }
 
 template <typename Op>
-__global__ __launch_bounds__(1024) void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ReductionCleanupKernelHalfFloat(Op reducer, half* output, packet_traits<Eigen::half>::type* scratch) {
   eigen_assert(threadIdx.x == 1);
   half2* pscratch = reinterpret_cast<half2*>(scratch);
   half tmp = __float2half(0.f);
@@ -476,7 +476,7 @@ struct FullReducer<Self, Op, GpuDevice, Vectorizable> {
 
 template <int NumPerThread, typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                          typename Self::CoeffReturnType* output) {
 #if (defined(EIGEN_HIP_DEVICE_COMPILE) && defined(__HIP_ARCH_HAS_WARP_SHUFFLE__)) || (EIGEN_CUDA_ARCH >= 300)
   typedef typename Self::CoeffReturnType Type;
@@ -561,7 +561,7 @@ __global__ __launch_bounds__(1024) void InnerReductionKernel(Reducer reducer, co
 
 template <int NumPerThread, typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void InnerReductionKernelHalfFloat(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                               half* output) {
   eigen_assert(blockDim.y == 1);
   eigen_assert(blockDim.z == 1);
@@ -868,7 +868,7 @@ struct InnerReducer<Self, Op, GpuDevice> {
 
 template <int NumPerThread, typename Self,
           typename Reducer, typename Index>
-__global__ __launch_bounds__(1024) void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
+__global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void OuterReductionKernel(Reducer reducer, const Self input, Index num_coeffs_to_reduce, Index num_preserved_coeffs,
                                      typename Self::CoeffReturnType* output) {
   const Index num_threads = blockDim.x * gridDim.x;
   const Index thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-- 
cgit v1.2.3