From 070d303d56d46d2e018a58214da24ca629ea454f Mon Sep 17 00:00:00 2001 From: Antonio Sanchez Date: Tue, 22 Dec 2020 22:49:06 -0800 Subject: Add CUDA complex sqrt. This is to support scalar `sqrt` of complex numbers `std::complex` on device, requested by Tensorflow folks. Technically `std::complex` is not supported by NVCC on device (though it is by clang), so the default `sqrt(std::complex)` function only works on the host. Here we create an overload to add back the functionality. Also modified the CMake file to add `--relaxed-constexpr` (or equivalent) flag for NVCC to allow calling constexpr functions from device functions, and added support for specifying compute architecture for NVCC (was already available for clang). --- Eigen/src/Core/MathFunctions.h | 30 +++++- Eigen/src/Core/arch/CUDA/Complex.h | 55 +++++++++-- .../Core/arch/Default/GenericPacketMathFunctions.h | 4 +- test/CMakeLists.txt | 11 +++ test/gpu_basic.cu | 101 ++++++++++++++++++++- test/gpu_common.h | 34 +++++-- .../CXX11/src/Tensor/TensorGpuHipCudaDefines.h | 8 +- 7 files changed, 216 insertions(+), 27 deletions(-) diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h index 3cf91bdb6..928bc8e72 100644 --- a/Eigen/src/Core/MathFunctions.h +++ b/Eigen/src/Core/MathFunctions.h @@ -323,6 +323,27 @@ struct abs2_retval typedef typename NumTraits::Real type; }; +/**************************************************************************** +* Implementation of sqrt * +****************************************************************************/ + +template +struct sqrt_impl +{ + EIGEN_DEVICE_FUNC + static EIGEN_ALWAYS_INLINE Scalar run(const Scalar& x) + { + EIGEN_USING_STD(sqrt); + return sqrt(x); + } +}; + +template +struct sqrt_retval +{ + typedef Scalar type; +}; + /**************************************************************************** * Implementation of norm1 * ****************************************************************************/ @@ -1368,12 +1389,11 @@ inline 
int log2(int x) * * It's usage is justified in performance critical functions, like norm/normalize. */ -template -EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE -T sqrt(const T &x) +template +EIGEN_DEVICE_FUNC +EIGEN_ALWAYS_INLINE EIGEN_MATHFUNC_RETVAL(sqrt, Scalar) sqrt(const Scalar& x) { - EIGEN_USING_STD(sqrt); - return sqrt(x); + return EIGEN_MATHFUNC_IMPL(sqrt, Scalar)::run(x); } // Boolean specialization, avoids implicit float to bool conversion (-Wimplicit-conversion-floating-point-to-bool). diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h index 57d1201f4..69334cafe 100644 --- a/Eigen/src/Core/arch/CUDA/Complex.h +++ b/Eigen/src/Core/arch/CUDA/Complex.h @@ -12,12 +12,12 @@ // clang-format off +#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE) + namespace Eigen { namespace internal { -#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU) - // Many std::complex methods such as operator+, operator-, operator* and // operator/ are not constexpr. Due to this, clang does not treat them as device // functions and thus Eigen functors making use of these operators fail to @@ -94,10 +94,53 @@ template struct scalar_quotient_op, const std: template struct scalar_quotient_op, std::complex > : scalar_quotient_op, const std::complex > {}; -#endif +template +struct sqrt_impl> { + static EIGEN_DEVICE_FUNC std::complex run(const std::complex& z) { + // Computes the principal sqrt of the input. + // + // For the complex square root of the number x + i*y, we want to find real + // numbers u and v such that + // (u + i*v)^2 = x + i*y <=> + // u^2 - v^2 + i*2*u*v = x + i*y. + // By equating the real and imaginary parts we get: + // u^2 - v^2 = x + // 2*u*v = y.
+ // + // For x >= 0, this has the numerically stable solution + // u = sqrt(0.5 * (x + sqrt(x^2 + y^2))) + // v = y / (2 * u) + // and for x < 0, + // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2))) + // u = y / (2 * v) + // + // Letting w = sqrt(0.5 * (|x| + |z|)), + // if x == 0: u = w, v = sign(y) * w + // if x > 0: u = w, v = y / (2 * w) + // if x < 0: u = |y| / (2 * w), v = sign(y) * w + + const T x = numext::real(z); + const T y = numext::imag(z); + const T zero = T(0); + const T cst_half = T(0.5); + + // Special case of isinf(y) + if ((numext::isinf)(y)) { + const T inf = std::numeric_limits::infinity(); + return std::complex(inf, y); + } + + T w = numext::sqrt(cst_half * (numext::abs(x) + numext::abs(z))); + return + x == zero ? std::complex(w, y < zero ? -w : w) + : x > zero ? std::complex(w, y / (2 * w)) + : std::complex(numext::abs(y) / (2 * w), y < zero ? -w : w ); + } +}; -} // end namespace internal +} // namespace internal +} // namespace Eigen -} // end namespace Eigen +#endif -#endif // EIGEN_COMPLEX_CUDA_H +#endif // EIGEN_COMPLEX_CUDA_H diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h index a6d2de62b..9253d8cab 100644 --- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h +++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h @@ -703,8 +703,8 @@ Packet psqrt_complex(const Packet& a) { // u = sqrt(0.5 * (x + sqrt(x^2 + y^2))) // v = 0.5 * (y / u) // and for x < 0, - // v = sign(y) * sqrt(0.5 * (x + sqrt(x^2 + y^2))) - // u = |0.5 * (y / v)| + // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2))) + // u = 0.5 * (y / v) // // To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as // l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) , diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8eda8d2f1..ce3782171 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -395,6 +395,12 @@ find_package(CUDA 5.0) 
if(CUDA_FOUND) set(CUDA_PROPAGATE_HOST_FLAGS OFF) + + set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr") + if (${CUDA_VERSION} STREQUAL "7.0") + set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr") + endif() + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE) endif() @@ -404,7 +410,12 @@ if(CUDA_FOUND) foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH) string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${GPU}") endforeach() + else() + foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH) + string(APPEND CUDA_NVCC_FLAGS " -gencode arch=compute_${GPU},code=sm_${GPU}") + endforeach() endif() + string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") ei_add_test(gpu_basic) diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu index e8069f185..b82b94d9b 100644 --- a/test/gpu_basic.cu +++ b/test/gpu_basic.cu @@ -14,7 +14,6 @@ #endif #define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_NO_COMPLEX #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #include "main.h" @@ -54,6 +53,59 @@ struct coeff_wise { } }; +template +struct complex_sqrt { + EIGEN_DEVICE_FUNC + void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const + { + using namespace Eigen; + typedef typename T::Scalar ComplexType; + typedef typename T::Scalar::value_type ValueType; + const int num_special_inputs = 18; + + if (i == 0) { + const ValueType nan = std::numeric_limits::quiet_NaN(); + typedef Eigen::Vector SpecialInputs; + SpecialInputs special_in; + special_in.setZero(); + int idx = 0; + special_in[idx++] = ComplexType(0, 0); + special_in[idx++] = ComplexType(-0, 0); + special_in[idx++] = ComplexType(0, -0); + special_in[idx++] = ComplexType(-0, -0); + // GCC's fallback sqrt implementation fails for inf inputs. 
+ // It is called when _GLIBCXX_USE_C99_COMPLEX is false or if + // clang includes the GCC header (which temporarily disables + // _GLIBCXX_USE_C99_COMPLEX) + #if !defined(_GLIBCXX_COMPLEX) || \ + (_GLIBCXX_USE_C99_COMPLEX && !defined(__CLANG_CUDA_WRAPPERS_COMPLEX)) + const ValueType inf = std::numeric_limits::infinity(); + special_in[idx++] = ComplexType(1.0, inf); + special_in[idx++] = ComplexType(nan, inf); + special_in[idx++] = ComplexType(1.0, -inf); + special_in[idx++] = ComplexType(nan, -inf); + special_in[idx++] = ComplexType(-inf, 1.0); + special_in[idx++] = ComplexType(inf, 1.0); + special_in[idx++] = ComplexType(-inf, -1.0); + special_in[idx++] = ComplexType(inf, -1.0); + special_in[idx++] = ComplexType(-inf, nan); + special_in[idx++] = ComplexType(inf, nan); + #endif + special_in[idx++] = ComplexType(1.0, nan); + special_in[idx++] = ComplexType(nan, 1.0); + special_in[idx++] = ComplexType(nan, -1.0); + special_in[idx++] = ComplexType(nan, nan); + + Map special_out(out); + special_out = special_in.cwiseSqrt(); + } + + T x1(in + i); + Map res(out + num_special_inputs + i*T::MaxSizeAtCompileTime); + res = x1.cwiseSqrt(); + } +}; + template struct replicate { EIGEN_DEVICE_FUNC @@ -161,17 +213,58 @@ struct matrix_inverse { } }; +template +bool verifyIsApproxWithInfsNans(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen's type only +{ + if (a.rows() != b.rows()) { + return false; + } + if (a.cols() != b.cols()) { + return false; + } + for (Index r = 0; r < a.rows(); ++r) { + for (Index c = 0; c < a.cols(); ++c) { + if (a(r, c) != b(r, c) + && !((numext::isnan)(a(r, c)) && (numext::isnan)(b(r, c))) + && !test_isApprox(a(r, c), b(r, c))) { + return false; + } + } + } + return true; +} + +template +void test_with_infs_nans(const Kernel& ker, int n, const Input& in, Output& out) +{ + Output out_ref, out_gpu; + #if !defined(EIGEN_GPU_COMPILE_PHASE) + out_ref = out_gpu = out; + #else + EIGEN_UNUSED_VARIABLE(in); + 
EIGEN_UNUSED_VARIABLE(out); + #endif + run_on_cpu (ker, n, in, out_ref); + run_on_gpu(ker, n, in, out_gpu); + #if !defined(EIGEN_GPU_COMPILE_PHASE) + verifyIsApproxWithInfsNans(out_ref, out_gpu); + #endif +} + EIGEN_DECLARE_TEST(gpu_basic) { ei_test_init_gpu(); int nthreads = 100; Eigen::VectorXf in, out; + Eigen::VectorXcf cfin, cfout; - #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) + #if !defined(EIGEN_GPU_COMPILE_PHASE) int data_size = nthreads * 512; in.setRandom(data_size); - out.setRandom(data_size); + out.setConstant(data_size, -1); + cfin.setRandom(data_size); + cfout.setConstant(data_size, -1); #endif CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise(), nthreads, in, out) ); @@ -204,6 +297,8 @@ EIGEN_DECLARE_TEST(gpu_basic) CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct(), nthreads, in, out) ); CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct(), nthreads, in, out) ); + CALL_SUBTEST( test_with_infs_nans(complex_sqrt(), nthreads, cfin, cfout) ); + #if defined(__NVCC__) // FIXME // These subtests compiles only with nvcc and fail with HIPCC and clang-cuda diff --git a/test/gpu_common.h b/test/gpu_common.h index 049e7aade..fe0485e98 100644 --- a/test/gpu_common.h +++ b/test/gpu_common.h @@ -68,8 +68,20 @@ void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out) #else run_on_gpu_meta_kernel<<>>(ker, n, d_in, d_out); #endif + // Pre-launch errors. + gpuError_t err = gpuGetLastError(); + if (err != gpuSuccess) { + printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err)); + gpu_assert(false); + } + + // Kernel execution errors. 
+ err = gpuDeviceSynchronize(); + if (err != gpuSuccess) { + printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err)); + gpu_assert(false); + } - gpuDeviceSynchronize(); // check inputs have not been modified gpuMemcpy(const_cast(in.data()), d_in, in_bytes, gpuMemcpyDeviceToHost); @@ -85,7 +97,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o { Input in_ref, in_gpu; Output out_ref, out_gpu; - #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) + #if !defined(EIGEN_GPU_COMPILE_PHASE) in_ref = in_gpu = in; out_ref = out_gpu = out; #else @@ -94,7 +106,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o #endif run_on_cpu (ker, n, in_ref, out_ref); run_on_gpu(ker, n, in_gpu, out_gpu); - #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__) + #if !defined(EIGEN_GPU_COMPILE_PHASE) VERIFY_IS_APPROX(in_ref, in_gpu); VERIFY_IS_APPROX(out_ref, out_gpu); #endif @@ -102,14 +114,16 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o struct compile_time_device_info { EIGEN_DEVICE_FUNC - void operator()(int /*i*/, const int* /*in*/, int* info) const + void operator()(int i, const int* /*in*/, int* info) const { - #if defined(__CUDA_ARCH__) - info[0] = int(__CUDA_ARCH__ +0); - #endif - #if defined(EIGEN_HIP_DEVICE_COMPILE) - info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0); - #endif + if (i == 0) { + #if defined(__CUDA_ARCH__) + info[0] = int(__CUDA_ARCH__ +0); + #endif + #if defined(EIGEN_HIP_DEVICE_COMPILE) + info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0); + #endif + } } }; diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h index f32ce27e9..cb53ce298 100644 --- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h +++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h @@ -16,7 +16,7 @@ // for some reason gets sent to the gcc/host compiler instead of the 
gpu/nvcc/hipcc compiler // When compiling such files, gcc will end up trying to pick up the CUDA headers by // default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU) -// This will obsviously not work when trying to compile tensorflow on a system with no CUDA +// This will obviously not work when trying to compile tensorflow on a system with no CUDA // To work around this issue for HIP systems (and leave the default behaviour intact), the // HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and // "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is @@ -30,6 +30,9 @@ #define gpuSuccess hipSuccess #define gpuErrorNotReady hipErrorNotReady #define gpuGetDeviceCount hipGetDeviceCount +#define gpuGetLastError hipGetLastError +#define gpuPeekAtLastError hipPeekAtLastError +#define gpuGetErrorName hipGetErrorName #define gpuGetErrorString hipGetErrorString #define gpuGetDeviceProperties hipGetDeviceProperties #define gpuStreamDefault hipStreamDefault @@ -57,6 +60,9 @@ #define gpuSuccess cudaSuccess #define gpuErrorNotReady cudaErrorNotReady #define gpuGetDeviceCount cudaGetDeviceCount +#define gpuGetLastError cudaGetLastError +#define gpuPeekAtLastError cudaPeekAtLastError +#define gpuGetErrorName cudaGetErrorName #define gpuGetErrorString cudaGetErrorString #define gpuGetDeviceProperties cudaGetDeviceProperties #define gpuStreamDefault cudaStreamDefault -- cgit v1.2.3