From 070d303d56d46d2e018a58214da24ca629ea454f Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 22 Dec 2020 22:49:06 -0800 Subject: Add CUDA complex sqrt. This is to support scalar `sqrt` of complex numbers `std::complex` on device, requested by Tensorflow folks. Technically `std::complex` is not supported by NVCC on device (though it is by clang), so the default `sqrt(std::complex)` function only works on the host. Here we create an overload to add back the functionality. Also modified the CMake file to add `--relaxed-constexpr` (or equivalent) flag for NVCC to allow calling constexpr functions from device functions, and added support for specifying compute architecture for NVCC (was already available for clang). --- unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'unsupported') diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h index f32ce27e9..cb53ce298 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h @@ -16,7 +16,7 @@ // for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler // When compiling such files, gcc will end up trying to pick up the CUDA headers by // default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU) -// This will obsviously not work when trying to compile tensorflow on a system with no CUDA +// This will obviously not work when trying to compile tensorflow on a system with no CUDA // To work around this issue for HIP systems (and leave the default behaviour intact), the // HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and // "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is @@ -30,6 +30,9 @@ #define gpuSuccess hipSuccess #define gpuErrorNotReady hipErrorNotReady #define gpuGetDeviceCount hipGetDeviceCount +#define gpuGetLastError hipGetLastError +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorName hipGetErrorName #define gpuGetErrorString hipGetErrorString #define gpuGetDeviceProperties hipGetDeviceProperties #define gpuStreamDefault hipStreamDefault @@ -57,6 +60,9 @@ #define gpuSuccess cudaSuccess #define gpuErrorNotReady cudaErrorNotReady #define gpuGetDeviceCount cudaGetDeviceCount +#define gpuGetLastError cudaGetLastError +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorName cudaGetErrorName #define gpuGetErrorString cudaGetErrorString #define gpuGetDeviceProperties cudaGetDeviceProperties #define gpuStreamDefault cudaStreamDefault -- cgit v1.2.3