From 070d303d56d46d2e018a58214da24ca629ea454f Mon Sep 17 00:00:00 2001
From: Antonio Sanchez
Date: Tue, 22 Dec 2020 22:49:06 -0800
Subject: Add CUDA complex sqrt.

This is to support scalar `sqrt` of complex numbers `std::complex<T>` on
device, requested by Tensorflow folks.

Technically `std::complex` is not supported by NVCC on device (though it is
by clang), so the default `sqrt(std::complex<T>)` function only works on the
host. Here we create an overload to add back the functionality.

Also modified the CMake file to add `--relaxed-constexpr` (or equivalent)
flag for NVCC to allow calling constexpr functions from device functions,
and added support for specifying compute architecture for NVCC (was already
available for clang).
---
Reviewer notes: line breaks and template argument lists (`<...>`) were lost
when this patch was extracted and have been reconstructed. The arguments on
the pre-existing CALL_SUBTEST context lines and on `complex_sqrt<Vector3cf>`
are assumed - TODO confirm against the tree. Hunk offsets and the diffstat
account for the comment lines added in review; the `index` blob ids are
unchanged from the original patch.

 test/gpu_basic.cu | 111 ++++++++++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 108 insertions(+), 3 deletions(-)

(limited to 'test/gpu_basic.cu')

diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu
index e8069f185..b82b94d9b 100644
--- a/test/gpu_basic.cu
+++ b/test/gpu_basic.cu
@@ -14,7 +14,6 @@
 #endif
 
 #define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
 
 #include "main.h"
@@ -54,6 +53,65 @@ struct coeff_wise {
   }
 };
 
+// Kernel functor that writes coefficient-wise square roots of complex inputs.
+// Call 0 additionally fills out[0, num_special_inputs) with square roots of
+// special values (zeros, NaNs and, unless GCC's fallback complex sqrt is in
+// use, infinities); every call i then writes the square root of the T read at
+// in + i to out + num_special_inputs + i * T::MaxSizeAtCompileTime.
+template<typename T>
+struct complex_sqrt {
+  EIGEN_DEVICE_FUNC
+  void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+  {
+    using namespace Eigen;
+    typedef typename T::Scalar ComplexType;
+    typedef typename T::Scalar::value_type ValueType;
+    const int num_special_inputs = 18;
+
+    if (i == 0) {
+      const ValueType nan = std::numeric_limits<ValueType>::quiet_NaN();
+      typedef Eigen::Vector<ComplexType, num_special_inputs> SpecialInputs;
+      SpecialInputs special_in;
+      special_in.setZero();
+      int idx = 0;
+      // Floating-point literals are required here: the integer -0 is just 0.
+      special_in[idx++] = ComplexType(0.0, 0.0);
+      special_in[idx++] = ComplexType(-0.0, 0.0);
+      special_in[idx++] = ComplexType(0.0, -0.0);
+      special_in[idx++] = ComplexType(-0.0, -0.0);
+      // GCC's fallback sqrt implementation fails for inf inputs.
+      // It is called when _GLIBCXX_USE_C99_COMPLEX is false or if
+      // clang includes the GCC header (which temporarily disables
+      // _GLIBCXX_USE_C99_COMPLEX)
+      #if !defined(_GLIBCXX_COMPLEX) || \
+        (_GLIBCXX_USE_C99_COMPLEX && !defined(__CLANG_CUDA_WRAPPERS_COMPLEX))
+      const ValueType inf = std::numeric_limits<ValueType>::infinity();
+      special_in[idx++] = ComplexType(1.0, inf);
+      special_in[idx++] = ComplexType(nan, inf);
+      special_in[idx++] = ComplexType(1.0, -inf);
+      special_in[idx++] = ComplexType(nan, -inf);
+      special_in[idx++] = ComplexType(-inf, 1.0);
+      special_in[idx++] = ComplexType(inf, 1.0);
+      special_in[idx++] = ComplexType(-inf, -1.0);
+      special_in[idx++] = ComplexType(inf, -1.0);
+      special_in[idx++] = ComplexType(-inf, nan);
+      special_in[idx++] = ComplexType(inf, nan);
+      #endif
+      special_in[idx++] = ComplexType(1.0, nan);
+      special_in[idx++] = ComplexType(nan, 1.0);
+      special_in[idx++] = ComplexType(nan, -1.0);
+      special_in[idx++] = ComplexType(nan, nan);
+
+      Map<SpecialInputs> special_out(out);
+      special_out = special_in.cwiseSqrt();
+    }
+
+    T x1(in + i);
+    Map<T> res(out + num_special_inputs + i*T::MaxSizeAtCompileTime);
+    res = x1.cwiseSqrt();
+  }
+};
+
 template<typename T>
 struct replicate {
   EIGEN_DEVICE_FUNC
@@ -161,17 +219,62 @@ struct matrix_inverse {
   }
 };
 
+// Returns true when a and b have the same dimensions and every pair of
+// coefficients is equal, is NaN on both sides, or passes test_isApprox.
+template<typename Type1, typename Type2>
+bool verifyIsApproxWithInfsNans(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's type only
+{
+  if (a.rows() != b.rows()) {
+    return false;
+  }
+  if (a.cols() != b.cols()) {
+    return false;
+  }
+  for (Index r = 0; r < a.rows(); ++r) {
+    for (Index c = 0; c < a.cols(); ++c) {
+      if (a(r, c) != b(r, c)
+          && !((numext::isnan)(a(r, c)) && (numext::isnan)(b(r, c)))
+          && !test_isApprox(a(r, c), b(r, c))) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+// Runs ker through run_on_cpu and run_on_gpu, each starting from a copy of
+// out, and verifies that the two results agree (matching NaNs count as equal).
+template<typename Kernel, typename Input, typename Output>
+void test_with_infs_nans(const Kernel& ker, int n, const Input& in, Output& out)
+{
+  Output out_ref, out_gpu;
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
+  out_ref = out_gpu = out;
+  #else
+  EIGEN_UNUSED_VARIABLE(in);
+  EIGEN_UNUSED_VARIABLE(out);
+  #endif
+  run_on_cpu(ker, n, in, out_ref);
+  run_on_gpu(ker, n, in, out_gpu);
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
+  VERIFY(verifyIsApproxWithInfsNans(out_ref, out_gpu));
+  #endif
+}
+
 EIGEN_DECLARE_TEST(gpu_basic)
 {
   ei_test_init_gpu();
 
   int nthreads = 100;
   Eigen::VectorXf in, out;
+  Eigen::VectorXcf cfin, cfout;
 
-  #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+  #if !defined(EIGEN_GPU_COMPILE_PHASE)
   int data_size = nthreads * 512;
   in.setRandom(data_size);
-  out.setRandom(data_size);
+  out.setConstant(data_size, -1);
+  cfin.setRandom(data_size);
+  cfout.setConstant(data_size, -1);
   #endif
 
   CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) );
@@ -204,6 +307,8 @@ EIGEN_DECLARE_TEST(gpu_basic)
   CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
   CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );
 
+  CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) );
+
 #if defined(__NVCC__)
   // FIXME
   // These subtests compiles only with nvcc and fail with HIPCC and clang-cuda
-- 
cgit v1.2.3