about summary refs log tree commit diff homepage
path: root/test/gpu_common.h
diff options
context:
space:
mode:
author Antonio Sanchez <cantonios@google.com> 2020-12-22 22:49:06 -0800
committer Antonio Sanchez <cantonios@google.com> 2020-12-22 23:25:23 -0800
commit 070d303d56d46d2e018a58214da24ca629ea454f (patch)
tree 3dfa72bf48ffdca0a67bd794596e4e452d50ed19 /test/gpu_common.h
parent fdf2ee62c5174441076fb64c9737d89bbe102759 (diff)
Add CUDA complex sqrt.
This is to support scalar `sqrt` of complex numbers `std::complex<T>` on device, requested by Tensorflow folks. Technically `std::complex` is not supported by NVCC on device (though it is by clang), so the default `sqrt(std::complex<T>)` function only works on the host. Here we create an overload to add back the functionality. Also modified the CMake file to add `--relaxed-constexpr` (or equivalent) flag for NVCC to allow calling constexpr functions from device functions, and added support for specifying compute architecture for NVCC (was already available for clang).
Diffstat (limited to 'test/gpu_common.h')
-rw-r--r-- test/gpu_common.h | 34
1 file changed, 24 insertions, 10 deletions
diff --git a/test/gpu_common.h b/test/gpu_common.h
index 049e7aade..fe0485e98 100644
--- a/test/gpu_common.h
+++ b/test/gpu_common.h
@@ -68,8 +68,20 @@ void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out)
#else
run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
#endif
+ // Pre-launch errors.
+ gpuError_t err = gpuGetLastError();
+ if (err != gpuSuccess) {
+ printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+ gpu_assert(false);
+ }
+
+ // Kernel execution errors.
+ err = gpuDeviceSynchronize();
+ if (err != gpuSuccess) {
+ printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+ gpu_assert(false);
+ }
- gpuDeviceSynchronize();
// check inputs have not been modified
gpuMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, gpuMemcpyDeviceToHost);
@@ -85,7 +97,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
{
Input in_ref, in_gpu;
Output out_ref, out_gpu;
- #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+ #if !defined(EIGEN_GPU_COMPILE_PHASE)
in_ref = in_gpu = in;
out_ref = out_gpu = out;
#else
@@ -94,7 +106,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
#endif
run_on_cpu (ker, n, in_ref, out_ref);
run_on_gpu(ker, n, in_gpu, out_gpu);
- #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+ #if !defined(EIGEN_GPU_COMPILE_PHASE)
VERIFY_IS_APPROX(in_ref, in_gpu);
VERIFY_IS_APPROX(out_ref, out_gpu);
#endif
@@ -102,14 +114,16 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
struct compile_time_device_info {
EIGEN_DEVICE_FUNC
- void operator()(int /*i*/, const int* /*in*/, int* info) const
+ void operator()(int i, const int* /*in*/, int* info) const
{
- #if defined(__CUDA_ARCH__)
- info[0] = int(__CUDA_ARCH__ +0);
- #endif
- #if defined(EIGEN_HIP_DEVICE_COMPILE)
- info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
- #endif
+ if (i == 0) {
+ #if defined(__CUDA_ARCH__)
+ info[0] = int(__CUDA_ARCH__ +0);
+ #endif
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
+ #endif
+ }
}
};