From 2859db0220cd8644c1a75a3bf04f62f551f73f22 Mon Sep 17 00:00:00 2001
From: Rohit Santhanam <rohit.santhanam@amd.com>
Date: Thu, 8 Apr 2021 15:14:48 +0000
Subject: This fixes an issue where the compiler was not choosing the GPU
 specific specialization of ScanLauncher.

The issue was discovered when the GPU scan unit test was run and resulted in a segmentation fault.

The segmentation fault occurred because the unit test allocated GPU memory and passed a pointer to that memory to the computation that it presumed would execute on the GPU.

But because of the issue, the computation was scheduled to execute on the CPU, so the CPU attempted to access a GPU memory location.

The fix expands the GPU specific ScanLauncher specialization to handle cases where vectorization is enabled.

Previously, the GPU specialization was chosen only if vectorization was not used.
---
 unsupported/Eigen/CXX11/src/Tensor/TensorScan.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'unsupported')
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
index 98c8250f0..a06c4a9f3 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorScan.h
@@ -357,8 +357,8 @@ __global__ EIGEN_HIP_LAUNCH_BOUNDS_1024 void ScanKernel(Self self, Index total_s
 
 }
 
-template <typename Self, typename Reducer>
-struct ScanLauncher<Self, Reducer, GpuDevice, false> {
+template <typename Self, typename Reducer, bool Vectorize>
+struct ScanLauncher<Self, Reducer, GpuDevice, Vectorize> {
   void operator()(const Self& self, typename Self::CoeffReturnType* data) {
      Index total_size = internal::array_prod(self.dimensions());
      Index num_blocks = (total_size / self.size() + 63) / 64;
-- 
cgit v1.2.3