about | summary | refs | log | tree | commit | diff | homepage
path: root/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
diff options
context:
space:
mode:
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h')
-rw-r--r-- unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h 14
1 file changed, 7 insertions, 7 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
index af9b58816..9b0eb3e2f 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorReduction.h
@@ -15,7 +15,7 @@
// so we'll use a macro to make clang happy.
#ifndef KERNEL_FRIEND
#if defined(__clang__) && (defined(__CUDA__) || defined(__HIP__))
-#define KERNEL_FRIEND friend __global__
+#define KERNEL_FRIEND friend __global__ __launch_bounds__(1024)
#else
#define KERNEL_FRIEND friend
#endif
@@ -427,24 +427,24 @@ struct GenericReducer {
#if defined(EIGEN_USE_GPU) && (defined(EIGEN_GPUCC))
template <int B, int N, typename S, typename R, typename I_>
-__global__ void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
+__global__ __launch_bounds__(1024) void FullReductionKernel(R, const S, I_, typename S::CoeffReturnType*, unsigned int*);
#if defined(EIGEN_HAS_GPU_FP16)
template <typename S, typename R, typename I_>
-__global__ void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
+__global__ __launch_bounds__(1024) void ReductionInitFullReduxKernelHalfFloat(R, const S, I_, internal::packet_traits<half>::type*);
template <int B, int N, typename S, typename R, typename I_>
-__global__ void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
+__global__ __launch_bounds__(1024) void FullReductionKernelHalfFloat(R, const S, I_, half*, internal::packet_traits<half>::type*);
template <int NPT, typename S, typename R, typename I_>
-__global__ void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
+__global__ __launch_bounds__(1024) void InnerReductionKernelHalfFloat(R, const S, I_, I_, half*);
#endif
template <int NPT, typename S, typename R, typename I_>
-__global__ void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+__global__ __launch_bounds__(1024) void InnerReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
template <int NPT, typename S, typename R, typename I_>
-__global__ void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
+__global__ __launch_bounds__(1024) void OuterReductionKernel(R, const S, I_, I_, typename S::CoeffReturnType*);
#endif
/**