Add CUDA complex sqrt.

This adds support for scalar `sqrt` of complex numbers `std::complex<T>` on device, as requested by the TensorFlow team. Technically, `std::complex` is not supported by NVCC on device (though it is by clang), so the default `sqrt(std::complex<T>)` function only works on the host. Here we create an overload to restore that functionality on device. Also modified the CMake file to add the `--relaxed-constexpr` (or equivalent) flag for NVCC to allow calling constexpr functions from device functions, and added support for specifying the compute architecture for NVCC (this was already available for clang).
author: Antonio Sanchez <cantonios@google.com> 2020-12-22 22:49:06 -0800
committer: Antonio Sanchez <cantonios@google.com> 2020-12-22 23:25:23 -0800
commit: 070d303d56d46d2e018a58214da24ca629ea454f (patch)
tree: 3dfa72bf48ffdca0a67bd794596e4e452d50ed19 /unsupported
parent: fdf2ee62c5174441076fb64c9737d89bbe102759 (diff)
1 files changed, 7 insertions, 1 deletions
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
index f32ce27e9..cb53ce298 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
@@ -16,7 +16,7 @@
 // for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler
 // When compiling such files, gcc will end up trying to pick up the CUDA headers by 
 // default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU)
-// This will obsviously not work when trying to compile tensorflow on a system with no CUDA
+// This will obviously not work when trying to compile tensorflow on a system with no CUDA
 // To work around this issue for HIP systems (and leave the default behaviour intact), the
 // HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and 
 // "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is
@@ -30,6 +30,9 @@
 #define gpuSuccess hipSuccess
 #define gpuErrorNotReady hipErrorNotReady
 #define gpuGetDeviceCount hipGetDeviceCount
+#define gpuGetLastError hipGetLastError
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorName hipGetErrorName
 #define gpuGetErrorString hipGetErrorString
 #define gpuGetDeviceProperties hipGetDeviceProperties
 #define gpuStreamDefault hipStreamDefault
@@ -57,6 +60,9 @@
 #define gpuSuccess cudaSuccess
 #define gpuErrorNotReady cudaErrorNotReady
 #define gpuGetDeviceCount cudaGetDeviceCount
+#define gpuGetLastError cudaGetLastError
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorName cudaGetErrorName
 #define gpuGetErrorString cudaGetErrorString
 #define gpuGetDeviceProperties cudaGetDeviceProperties
 #define gpuStreamDefault cudaStreamDefault
author	Antonio Sanchez <cantonios@google.com>	2020-12-22 22:49:06 -0800
committer	Antonio Sanchez <cantonios@google.com>	2020-12-22 23:25:23 -0800
commit	070d303d56d46d2e018a58214da24ca629ea454f (patch)
tree	3dfa72bf48ffdca0a67bd794596e4e452d50ed19 /unsupported
parent	fdf2ee62c5174441076fb64c9737d89bbe102759 (diff)