diff options
Diffstat (limited to 'test')
-rw-r--r-- | test/CMakeLists.txt | 11 | ||||
-rw-r--r-- | test/gpu_basic.cu | 101 | ||||
-rw-r--r-- | test/gpu_common.h | 34 |
3 files changed, 133 insertions, 13 deletions
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8eda8d2f1..ce3782171 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -395,6 +395,12 @@ find_package(CUDA 5.0) if(CUDA_FOUND) set(CUDA_PROPAGATE_HOST_FLAGS OFF) + + set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr") + if (${CUDA_VERSION} STREQUAL "7.0") + set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr") + endif() + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE) endif() @@ -404,7 +410,12 @@ if(CUDA_FOUND) foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH) string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${GPU}") endforeach() + else() + foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH) + string(APPEND CUDA_NVCC_FLAGS " -gencode arch=compute_${GPU},code=sm_${GPU}") + endforeach() endif() + string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") ei_add_test(gpu_basic) diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu index e8069f185..b82b94d9b 100644 --- a/test/gpu_basic.cu +++ b/test/gpu_basic.cu @@ -14,7 +14,6 @@ #endif #define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #include "main.h" @@ -55,6 +54,59 @@ struct coeff_wise { }; template<typename T> +struct complex_sqrt { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + typedef typename T::Scalar ComplexType; + typedef typename T::Scalar::value_type ValueType; + const int num_special_inputs = 18; + + if (i == 0) { + const ValueType nan = std::numeric_limits<ValueType>::quiet_NaN(); + typedef Eigen::Vector<ComplexType, num_special_inputs> SpecialInputs; + SpecialInputs special_in; + special_in.setZero(); + int idx = 0; + special_in[idx++] = ComplexType(0, 0); + special_in[idx++] = ComplexType(-0, 0); + special_in[idx++] = ComplexType(0, -0); + special_in[idx++] = ComplexType(-0, -0); + // GCC's fallback sqrt implementation fails for inf inputs. + // It is called when _GLIBCXX_USE_C99_COMPLEX is false or if + // clang includes the GCC header (which temporarily disables + // _GLIBCXX_USE_C99_COMPLEX) + #if !defined(_GLIBCXX_COMPLEX) || \ + (_GLIBCXX_USE_C99_COMPLEX && !defined(__CLANG_CUDA_WRAPPERS_COMPLEX)) + const ValueType inf = std::numeric_limits<ValueType>::infinity(); + special_in[idx++] = ComplexType(1.0, inf); + special_in[idx++] = ComplexType(nan, inf); + special_in[idx++] = ComplexType(1.0, -inf); + special_in[idx++] = ComplexType(nan, -inf); + special_in[idx++] = ComplexType(-inf, 1.0); + special_in[idx++] = ComplexType(inf, 1.0); + special_in[idx++] = ComplexType(-inf, -1.0); + special_in[idx++] = ComplexType(inf, -1.0); + special_in[idx++] = ComplexType(-inf, nan); + special_in[idx++] = ComplexType(inf, nan); + #endif + special_in[idx++] = ComplexType(1.0, nan); + special_in[idx++] = ComplexType(nan, 1.0); + special_in[idx++] = ComplexType(nan, -1.0); + special_in[idx++] = ComplexType(nan, nan); + + Map<SpecialInputs> special_out(out); + special_out = special_in.cwiseSqrt(); + } + + T x1(in + i); + Map<T> res(out + num_special_inputs + i*T::MaxSizeAtCompileTime); + res = x1.cwiseSqrt(); + } +}; + +template<typename T> struct replicate { EIGEN_DEVICE_FUNC void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const @@ -161,17 +213,58 @@ struct matrix_inverse { } }; +template<typename Type1, typename Type2> +bool verifyIsApproxWithInfsNans(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's type only +{ + if (a.rows() != b.rows()) { + return false; + } + if (a.cols() != b.cols()) { + return false; + } + for (Index r = 0; r < a.rows(); ++r) { + for (Index c = 0; c < a.cols(); ++c) { + if (a(r, c) != b(r, c) + && !((numext::isnan)(a(r, c)) && (numext::isnan)(b(r, c))) + && !test_isApprox(a(r, c), b(r, c))) { + return false; + } + } + } + return true; +} + +template<typename Kernel, typename Input, typename Output> +void test_with_infs_nans(const Kernel& ker, int n, const Input& in, Output& out) +{ + Output out_ref, out_gpu; + #if !defined(EIGEN_GPU_COMPILE_PHASE) + out_ref = out_gpu = out; + #else + EIGEN_UNUSED_VARIABLE(in); + EIGEN_UNUSED_VARIABLE(out); + #endif + run_on_cpu (ker, n, in, out_ref); + run_on_gpu(ker, n, in, out_gpu); + #if !defined(EIGEN_GPU_COMPILE_PHASE) + verifyIsApproxWithInfsNans(out_ref, out_gpu); + #endif +} + EIGEN_DECLARE_TEST(gpu_basic) { ei_test_init_gpu(); int nthreads = 100; Eigen::VectorXf in, out; + Eigen::VectorXcf cfin, cfout; - #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) + #if !defined(EIGEN_GPU_COMPILE_PHASE) int data_size = nthreads * 512; in.setRandom(data_size); - out.setRandom(data_size); + out.setConstant(data_size, -1); + cfin.setRandom(data_size); + cfout.setConstant(data_size, -1); #endif CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) ); @@ -204,6 +297,8 @@ EIGEN_DECLARE_TEST(gpu_basic) CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) ); CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) ); + CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) ); + #if defined(__NVCC__) // FIXME // These subtests compiles only with nvcc and fail with HIPCC and clang-cuda diff --git a/test/gpu_common.h b/test/gpu_common.h index 049e7aade..fe0485e98 100644 --- a/test/gpu_common.h +++ b/test/gpu_common.h @@ -68,8 +68,20 @@ void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out) #else run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out); #endif + // Pre-launch errors. + gpuError_t err = gpuGetLastError(); + if (err != gpuSuccess) { + printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err)); + gpu_assert(false); + } + + // Kernel execution errors. + err = gpuDeviceSynchronize(); + if (err != gpuSuccess) { + printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err)); + gpu_assert(false); + } - gpuDeviceSynchronize(); // check inputs have not been modified gpuMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, gpuMemcpyDeviceToHost); @@ -85,7 +97,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o { Input in_ref, in_gpu; Output out_ref, out_gpu; - #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) + #if !defined(EIGEN_GPU_COMPILE_PHASE) in_ref = in_gpu = in; out_ref = out_gpu = out; #else @@ -94,7 +106,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o #endif run_on_cpu (ker, n, in_ref, out_ref); run_on_gpu(ker, n, in_gpu, out_gpu); - #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) + #if !defined(EIGEN_GPU_COMPILE_PHASE) VERIFY_IS_APPROX(in_ref, in_gpu); VERIFY_IS_APPROX(out_ref, out_gpu); #endif @@ -102,14 +114,16 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o struct compile_time_device_info { EIGEN_DEVICE_FUNC - void operator()(int /*i*/, const int* /*in*/, int* info) const + void operator()(int i, const int* /*in*/, int* info) const { - #if defined(__CUDA_ARCH__) - info[0] = int(__CUDA_ARCH__ +0); - #endif - #if defined(EIGEN_HIP_DEVICE_COMPILE) - info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0); - #endif + if (i == 0) { + #if defined(__CUDA_ARCH__) + info[0] = int(__CUDA_ARCH__ +0); + #endif + #if defined(EIGEN_HIP_DEVICE_COMPILE) + info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0); + #endif + } } }; |