diff options
Diffstat (limited to 'unsupported/Eigen/CXX11/src/Tensor/TensorScan.h')
-rw-r--r-- | unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 2 |
1 files changed, 1 insertions, 1 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h index bef8d261f..9e3b1a0b9 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h @@ -334,7 +334,7 @@ struct ScanLauncher<Self, Reducer, ThreadPoolDevice, Vectorize> { // parallel, but it would be better to use a parallel scan algorithm and // optimize memory access. template <typename Self, typename Reducer> -__global__ void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) { +__global__ __launch_bounds__(1024) void ScanKernel(Self self, Index total_size, typename Self::CoeffReturnType* data) { // Compute offset as in the CPU version Index val = threadIdx.x + blockIdx.x * blockDim.x; Index offset = (val / self.stride()) * self.stride() * self.size() + val % self.stride(); |