about summary refs log tree commit diff homepage
path: root/test/gpu_common.h
diff options
context:
space:
mode:
author Antonio Sanchez <cantonios@google.com> 2020-12-22 22:49:06 -0800
committer Antonio Sanchez <cantonios@google.com> 2020-12-22 23:25:23 -0800
commit 070d303d56d46d2e018a58214da24ca629ea454f (patch)
tree 3dfa72bf48ffdca0a67bd794596e4e452d50ed19 /test/gpu_common.h
parent fdf2ee62c5174441076fb64c9737d89bbe102759 (diff)
Add CUDA complex sqrt.
This is to support scalar `sqrt` of complex numbers `std::complex<T>` on device, requested by Tensorflow folks. Technically `std::complex` is not supported by NVCC on device (though it is by clang), so the default `sqrt(std::complex<T>)` function only works on the host. Here we create an overload to add back the functionality. Also modified the CMake file to add `--relaxed-constexpr` (or equivalent) flag for NVCC to allow calling constexpr functions from device functions, and added support for specifying compute architecture for NVCC (was already available for clang).
Diffstat (limited to 'test/gpu_common.h')
-rw-r--r-- test/gpu_common.h | 34
1 file changed, 24 insertions, 10 deletions
diff --git a/test/gpu_common.h b/test/gpu_common.h
index 049e7aade..fe0485e98 100644
--- a/test/gpu_common.h
+++ b/test/gpu_common.h
@@ -68,8 +68,20 @@ void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out)
#else
run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
#endif
+ // Pre-launch errors.
+ gpuError_t err = gpuGetLastError();
+ if (err != gpuSuccess) {
+ printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+ gpu_assert(false);
+ }
+
+ // Kernel execution errors.
+ err = gpuDeviceSynchronize();
+ if (err != gpuSuccess) {
+ printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+ gpu_assert(false);
+ }
- gpuDeviceSynchronize();
// check inputs have not been modified
gpuMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, gpuMemcpyDeviceToHost);
@@ -85,7 +97,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
{
Input in_ref, in_gpu;
Output out_ref, out_gpu;
- #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+ #if !defined(EIGEN_GPU_COMPILE_PHASE)
in_ref = in_gpu = in;
out_ref = out_gpu = out;
#else
@@ -94,7 +106,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
#endif
run_on_cpu (ker, n, in_ref, out_ref);
run_on_gpu(ker, n, in_gpu, out_gpu);
- #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+ #if !defined(EIGEN_GPU_COMPILE_PHASE)
VERIFY_IS_APPROX(in_ref, in_gpu);
VERIFY_IS_APPROX(out_ref, out_gpu);
#endif
@@ -102,14 +114,16 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
struct compile_time_device_info {
EIGEN_DEVICE_FUNC
- void operator()(int /*i*/, const int* /*in*/, int* info) const
+ void operator()(int i, const int* /*in*/, int* info) const
{
- #if defined(__CUDA_ARCH__)
- info[0] = int(__CUDA_ARCH__ +0);
- #endif
- #if defined(EIGEN_HIP_DEVICE_COMPILE)
- info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
- #endif
+ if (i == 0) {
+ #if defined(__CUDA_ARCH__)
+ info[0] = int(__CUDA_ARCH__ +0);
+ #endif
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
+ #endif
+ }
}
};