diff options
author | Antonio Sanchez <cantonios@google.com> | 2020-12-22 22:49:06 -0800 |
---|---|---|
committer | Antonio Sanchez <cantonios@google.com> | 2020-12-22 23:25:23 -0800 |
commit | 070d303d56d46d2e018a58214da24ca629ea454f (patch) | |
tree | 3dfa72bf48ffdca0a67bd794596e4e452d50ed19 /test/gpu_common.h | |
parent | fdf2ee62c5174441076fb64c9737d89bbe102759 (diff) |
Add CUDA complex sqrt.
This is to support scalar `sqrt` of complex numbers `std::complex<T>` on
device, requested by Tensorflow folks.
Technically `std::complex` is not supported by NVCC on device
(though it is by clang), so the default `sqrt(std::complex<T>)` function only
works on the host. Here we create an overload to add back the
functionality.
Also modified the CMake file to add `--expt-relaxed-constexpr` (or
equivalent) flag for NVCC to allow calling constexpr functions from
device functions, and added support for specifying the compute architecture for
NVCC (this was already available for clang).
Diffstat (limited to 'test/gpu_common.h')
-rw-r--r-- | test/gpu_common.h | 34 |
1 file changed, 24 insertions, 10 deletions
diff --git a/test/gpu_common.h b/test/gpu_common.h
index 049e7aade..fe0485e98 100644
--- a/test/gpu_common.h
+++ b/test/gpu_common.h
@@ -68,8 +68,20 @@ void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out)
 #else
   run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
 #endif
+  // Pre-launch errors.
+  gpuError_t err = gpuGetLastError();
+  if (err != gpuSuccess) {
+    printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+    gpu_assert(false);
+  }
+
+  // Kernel execution errors.
+  err = gpuDeviceSynchronize();
+  if (err != gpuSuccess) {
+    printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+    gpu_assert(false);
+  }
-  gpuDeviceSynchronize();

   // check inputs have not been modified
   gpuMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, gpuMemcpyDeviceToHost);
@@ -85,7 +97,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
 {
   Input  in_ref,  in_gpu;
   Output out_ref, out_gpu;
-  #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
   in_ref = in_gpu = in;
   out_ref = out_gpu = out;
   #else
@@ -94,7 +106,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
   #endif
   run_on_cpu (ker, n,  in_ref,  out_ref);
   run_on_gpu(ker, n, in_gpu, out_gpu);
-  #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
   VERIFY_IS_APPROX( in_ref,  in_gpu);
   VERIFY_IS_APPROX(out_ref, out_gpu);
   #endif
@@ -102,14 +114,16 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o

 struct compile_time_device_info {
   EIGEN_DEVICE_FUNC
-  void operator()(int /*i*/, const int* /*in*/, int* info) const
+  void operator()(int i, const int* /*in*/, int* info) const
   {
-    #if defined(__CUDA_ARCH__)
-    info[0] = int(__CUDA_ARCH__ +0);
-    #endif
-    #if defined(EIGEN_HIP_DEVICE_COMPILE)
-    info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
-    #endif
+    if (i == 0) {
+      #if defined(__CUDA_ARCH__)
+      info[0] = int(__CUDA_ARCH__ +0);
+      #endif
+      #if defined(EIGEN_HIP_DEVICE_COMPILE)
+      info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
+      #endif
+    }
   }
 };