author     Antonio Sanchez <cantonios@google.com>  2020-12-22 22:49:06 -0800
committer  Antonio Sanchez <cantonios@google.com>  2020-12-22 23:25:23 -0800
commit     070d303d56d46d2e018a58214da24ca629ea454f (patch)
tree       3dfa72bf48ffdca0a67bd794596e4e452d50ed19
parent     fdf2ee62c5174441076fb64c9737d89bbe102759 (diff)
Add CUDA complex sqrt.
This is to support scalar `sqrt` of complex numbers `std::complex<T>` on device, requested by Tensorflow folks.

Technically `std::complex` is not supported by NVCC on device (though it is by clang), so the default `sqrt(std::complex<T>)` function only works on the host. Here we create an overload to add back the functionality.

Also modified the CMake file to add the `--relaxed-constexpr` (or equivalent) flag for NVCC to allow calling constexpr functions from device functions, and added support for specifying the compute architecture for NVCC (was already available for clang).
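As a rough sketch of what this enables (a hypothetical kernel, not part of this patch; it assumes Eigen is built with the relaxed-constexpr flag added below), device code can now take the principal square root of a complex scalar through `numext::sqrt`:

    #include <Eigen/Core>
    #include <cuda_runtime.h>
    #include <complex>

    // Each thread computes the principal square root of one complex value.
    // numext::sqrt dispatches to the device-side overload added in this patch,
    // since std::sqrt(std::complex<T>) is host-only under NVCC.
    __global__ void complex_sqrt_kernel(const std::complex<float>* in,
                                        std::complex<float>* out, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) out[i] = Eigen::numext::sqrt(in[i]);
    }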
-rw-r--r--  Eigen/src/Core/MathFunctions.h                                  30
-rw-r--r--  Eigen/src/Core/arch/CUDA/Complex.h                              55
-rw-r--r--  Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h        4
-rw-r--r--  test/CMakeLists.txt                                             11
-rw-r--r--  test/gpu_basic.cu                                               101
-rw-r--r--  test/gpu_common.h                                               34
-rw-r--r--  unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h    8
7 files changed, 216 insertions, 27 deletions
diff --git a/Eigen/src/Core/MathFunctions.h b/Eigen/src/Core/MathFunctions.h
index 3cf91bdb6..928bc8e72 100644
--- a/Eigen/src/Core/MathFunctions.h
+++ b/Eigen/src/Core/MathFunctions.h
@@ -324,6 +324,27 @@ struct abs2_retval
};
/****************************************************************************
+* Implementation of sqrt *
+****************************************************************************/
+
+template<typename Scalar>
+struct sqrt_impl
+{
+ EIGEN_DEVICE_FUNC
+ static EIGEN_ALWAYS_INLINE Scalar run(const Scalar& x)
+ {
+ EIGEN_USING_STD(sqrt);
+ return sqrt(x);
+ }
+};
+
+template<typename Scalar>
+struct sqrt_retval
+{
+ typedef Scalar type;
+};
+
+/****************************************************************************
* Implementation of norm1 *
****************************************************************************/
@@ -1368,12 +1389,11 @@ inline int log2(int x)
*
* Its usage is justified in performance-critical functions, like norm/normalize.
*/
-template<typename T>
-EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE
-T sqrt(const T &x)
+template<typename Scalar>
+EIGEN_DEVICE_FUNC
+EIGEN_ALWAYS_INLINE EIGEN_MATHFUNC_RETVAL(sqrt, Scalar) sqrt(const Scalar& x)
{
- EIGEN_USING_STD(sqrt);
- return sqrt(x);
+ return EIGEN_MATHFUNC_IMPL(sqrt, Scalar)::run(x);
}
// Boolean specialization, avoids implicit float to bool conversion (-Wimplicit-conversion-floating-point-to-bool).
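For context, the `EIGEN_MATHFUNC_IMPL`/`EIGEN_MATHFUNC_RETVAL` indirection used above follows Eigen's usual impl/retval dispatch pattern; a simplified standalone sketch (not Eigen's actual macros) looks like this:

    #include <cmath>

    // Generic implementation: defer to std::sqrt or an ADL match.
    template <typename Scalar>
    struct sqrt_impl {
      static Scalar run(const Scalar& x) {
        using std::sqrt;
        return sqrt(x);
      }
    };

    // Return-type trait; specializations may override it.
    template <typename Scalar>
    struct sqrt_retval {
      typedef Scalar type;
    };

    // The public entry point only forwards, so specializing sqrt_impl
    // (e.g. for std::complex<T> on device) swaps the implementation
    // without touching any call site.
    template <typename Scalar>
    typename sqrt_retval<Scalar>::type sqrt(const Scalar& x) {
      return sqrt_impl<Scalar>::run(x);
    }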
diff --git a/Eigen/src/Core/arch/CUDA/Complex.h b/Eigen/src/Core/arch/CUDA/Complex.h
index 57d1201f4..69334cafe 100644
--- a/Eigen/src/Core/arch/CUDA/Complex.h
+++ b/Eigen/src/Core/arch/CUDA/Complex.h
@@ -12,12 +12,12 @@
// clang-format off
+#if defined(EIGEN_CUDACC) && defined(EIGEN_GPU_COMPILE_PHASE)
+
namespace Eigen {
namespace internal {
-#if defined(EIGEN_CUDACC) && defined(EIGEN_USE_GPU)
-
// Many std::complex methods such as operator+, operator-, operator* and
// operator/ are not constexpr. Due to this, clang does not treat them as device
// functions and thus Eigen functors making use of these operators fail to
@@ -94,10 +94,53 @@ template<typename T> struct scalar_quotient_op<const std::complex<T>, const std:
template<typename T> struct scalar_quotient_op<std::complex<T>, std::complex<T> > : scalar_quotient_op<const std::complex<T>, const std::complex<T> > {};
-#endif
+template<typename T>
+struct sqrt_impl<std::complex<T>> {
+ static EIGEN_DEVICE_FUNC std::complex<T> run(const std::complex<T>& z) {
+ // Computes the principal sqrt of the input.
+ //
+ // For a complex square root of the number x + i*y, we want to find real
+ // numbers u and v such that
+ // (u + i*v)^2 = x + i*y <=>
+ // u^2 - v^2 + i*2*u*v = x + i*y.
+ // By equating the real and imaginary parts we get:
+ // u^2 - v^2 = x
+ // 2*u*v = y.
+ //
+ // For x >= 0, this has the numerically stable solution
+ // u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
+ // v = y / (2 * u)
+ // and for x < 0,
+ // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+ // u = y / (2 * v)
+ //
+ // Letting w = sqrt(0.5 * (|x| + |z|)),
+ // if x == 0: u = w, v = sign(y) * w
+ // if x > 0: u = w, v = y / (2 * w)
+ // if x < 0: u = |y| / (2 * w), v = sign(y) * w
+
+ const T x = numext::real(z);
+ const T y = numext::imag(z);
+ const T zero = T(0);
+ const T cst_half = T(0.5);
+
+ // Special case of isinf(y)
+ if ((numext::isinf)(y)) {
+ const T inf = std::numeric_limits<T>::infinity();
+ return std::complex<T>(inf, y);
+ }
+
+ T w = numext::sqrt(cst_half * (numext::abs(x) + numext::abs(z)));
+ return
+ x == zero ? std::complex<T>(w, y < zero ? -w : w)
+ : x > zero ? std::complex<T>(w, y / (2 * w))
+ : std::complex<T>(numext::abs(y) / (2 * w), y < zero ? -w : w);
+ }
+};
-} // end namespace internal
+} // namespace internal
+} // namespace Eigen
-} // end namespace Eigen
+#endif
-#endif // EIGEN_COMPLEX_CUDA_H
+#endif // EIGEN_COMPLEX_CUDA_H
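To sanity-check the branch logic in the comment above, here is a host-only sketch of the same computation (illustrative only; it skips the inf/NaN special cases the device code handles):

    #include <cmath>
    #include <complex>
    #include <cstdio>

    // Principal sqrt of x + i*y via the three branches described above.
    std::complex<double> principal_sqrt(double x, double y) {
      const double w = std::sqrt(0.5 * (std::abs(x) + std::hypot(x, y)));
      if (x == 0.0) return {w, y < 0.0 ? -w : w};
      if (x > 0.0)  return {w, y / (2.0 * w)};
      return {std::abs(y) / (2.0 * w), y < 0.0 ? -w : w};
    }

    int main() {
      // sqrt(-4 + 0i) = 0 + 2i on the principal branch: w = sqrt(0.5*(4+4)) = 2.
      std::complex<double> r = principal_sqrt(-4.0, 0.0);
      std::printf("%g + %gi\n", r.real(), r.imag());
    }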
diff --git a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
index a6d2de62b..9253d8cab 100644
--- a/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
+++ b/Eigen/src/Core/arch/Default/GenericPacketMathFunctions.h
@@ -703,8 +703,8 @@ Packet psqrt_complex(const Packet& a) {
// u = sqrt(0.5 * (x + sqrt(x^2 + y^2)))
// v = 0.5 * (y / u)
// and for x < 0,
- // v = sign(y) * sqrt(0.5 * (x + sqrt(x^2 + y^2)))
- // u = |0.5 * (y / v)|
+ // v = sign(y) * sqrt(0.5 * (-x + sqrt(x^2 + y^2)))
+ // u = 0.5 * (y / v)
//
// To avoid unnecessary over- and underflow, we compute sqrt(x^2 + y^2) as
// l = max(|x|, |y|) * sqrt(1 + (min(|x|, |y|) / max(|x|, |y|))^2) ,
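The scaling trick in that comment is worth spelling out; a minimal sketch (hypothetical helper, not the packet code itself):

    #include <algorithm>
    #include <cmath>

    // Overflow-avoiding |x + i*y|: factor out the larger magnitude so the
    // squared ratio stays in [0, 1] instead of squaring x and y directly.
    double stable_modulus(double x, double y) {
      const double ax = std::abs(x), ay = std::abs(y);
      const double mx = std::max(ax, ay), mn = std::min(ax, ay);
      if (mx == 0.0) return 0.0;           // guard the 0/0 case
      const double r = mn / mx;            // in [0, 1]
      return mx * std::sqrt(1.0 + r * r);  // safe until mx itself overflows
    }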
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 8eda8d2f1..ce3782171 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -395,6 +395,12 @@ find_package(CUDA 5.0)
if(CUDA_FOUND)
set(CUDA_PROPAGATE_HOST_FLAGS OFF)
+
+ set(EIGEN_CUDA_RELAXED_CONSTEXPR "--expt-relaxed-constexpr")
+ if (${CUDA_VERSION} STREQUAL "7.0")
+ set(EIGEN_CUDA_RELAXED_CONSTEXPR "--relaxed-constexpr")
+ endif()
+
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
set(CUDA_NVCC_FLAGS "-ccbin ${CMAKE_C_COMPILER}" CACHE STRING "nvcc flags" FORCE)
endif()
@@ -404,7 +410,12 @@ if(CUDA_FOUND)
foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
string(APPEND CMAKE_CXX_FLAGS " --cuda-gpu-arch=sm_${GPU}")
endforeach()
+ else()
+ foreach(GPU IN LISTS EIGEN_CUDA_COMPUTE_ARCH)
+ string(APPEND CUDA_NVCC_FLAGS " -gencode arch=compute_${GPU},code=sm_${GPU}")
+ endforeach()
endif()
+ string(APPEND CUDA_NVCC_FLAGS " ${EIGEN_CUDA_RELAXED_CONSTEXPR}")
set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu")
ei_add_test(gpu_basic)
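The relaxed-constexpr flag is needed because NVCC otherwise refuses to call host constexpr functions from device code, which many Eigen functors do. A minimal illustration (hypothetical file, not part of this patch):

    // Without --expt-relaxed-constexpr (--relaxed-constexpr on CUDA 7.0),
    // NVCC rejects the call below: half() is a host constexpr function
    // being invoked from a __global__ function.
    constexpr float half() { return 0.5f; }

    __global__ void scale(float* data, int n) {
      int i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) data[i] *= half();
    }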
diff --git a/test/gpu_basic.cu b/test/gpu_basic.cu
index e8069f185..b82b94d9b 100644
--- a/test/gpu_basic.cu
+++ b/test/gpu_basic.cu
@@ -14,7 +14,6 @@
#endif
#define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_NO_COMPLEX
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#include "main.h"
@@ -55,6 +54,59 @@ struct coeff_wise {
};
template<typename T>
+struct complex_sqrt {
+ EIGEN_DEVICE_FUNC
+ void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
+ {
+ using namespace Eigen;
+ typedef typename T::Scalar ComplexType;
+ typedef typename T::Scalar::value_type ValueType;
+ const int num_special_inputs = 18;
+
+ if (i == 0) {
+ const ValueType nan = std::numeric_limits<ValueType>::quiet_NaN();
+ typedef Eigen::Vector<ComplexType, num_special_inputs> SpecialInputs;
+ SpecialInputs special_in;
+ special_in.setZero();
+ int idx = 0;
+ special_in[idx++] = ComplexType(0, 0);
+ // Use -0.0 so the literal really is a negative zero (integer -0 is just 0).
+ special_in[idx++] = ComplexType(-0.0, 0);
+ special_in[idx++] = ComplexType(0, -0.0);
+ special_in[idx++] = ComplexType(-0.0, -0.0);
+ // GCC's fallback sqrt implementation fails for inf inputs.
+ // It is called when _GLIBCXX_USE_C99_COMPLEX is false or if
+ // clang includes the GCC header (which temporarily disables
+ // _GLIBCXX_USE_C99_COMPLEX)
+ #if !defined(_GLIBCXX_COMPLEX) || \
+ (_GLIBCXX_USE_C99_COMPLEX && !defined(__CLANG_CUDA_WRAPPERS_COMPLEX))
+ const ValueType inf = std::numeric_limits<ValueType>::infinity();
+ special_in[idx++] = ComplexType(1.0, inf);
+ special_in[idx++] = ComplexType(nan, inf);
+ special_in[idx++] = ComplexType(1.0, -inf);
+ special_in[idx++] = ComplexType(nan, -inf);
+ special_in[idx++] = ComplexType(-inf, 1.0);
+ special_in[idx++] = ComplexType(inf, 1.0);
+ special_in[idx++] = ComplexType(-inf, -1.0);
+ special_in[idx++] = ComplexType(inf, -1.0);
+ special_in[idx++] = ComplexType(-inf, nan);
+ special_in[idx++] = ComplexType(inf, nan);
+ #endif
+ special_in[idx++] = ComplexType(1.0, nan);
+ special_in[idx++] = ComplexType(nan, 1.0);
+ special_in[idx++] = ComplexType(nan, -1.0);
+ special_in[idx++] = ComplexType(nan, nan);
+
+ Map<SpecialInputs> special_out(out);
+ special_out = special_in.cwiseSqrt();
+ }
+
+ T x1(in + i);
+ Map<T> res(out + num_special_inputs + i*T::MaxSizeAtCompileTime);
+ res = x1.cwiseSqrt();
+ }
+};
+
+template<typename T>
struct replicate {
EIGEN_DEVICE_FUNC
void operator()(int i, const typename T::Scalar* in, typename T::Scalar* out) const
@@ -161,17 +213,58 @@ struct matrix_inverse {
}
};
+template<typename Type1, typename Type2>
+bool verifyIsApproxWithInfsNans(const Type1& a, const Type2& b, typename Type1::Scalar* = 0) // Enabled for Eigen types only
+{
+ if (a.rows() != b.rows()) {
+ return false;
+ }
+ if (a.cols() != b.cols()) {
+ return false;
+ }
+ for (Index r = 0; r < a.rows(); ++r) {
+ for (Index c = 0; c < a.cols(); ++c) {
+ if (a(r, c) != b(r, c)
+ && !((numext::isnan)(a(r, c)) && (numext::isnan)(b(r, c)))
+ && !test_isApprox(a(r, c), b(r, c))) {
+ return false;
+ }
+ }
+ }
+ return true;
+}
+
+template<typename Kernel, typename Input, typename Output>
+void test_with_infs_nans(const Kernel& ker, int n, const Input& in, Output& out)
+{
+ Output out_ref, out_gpu;
+ #if !defined(EIGEN_GPU_COMPILE_PHASE)
+ out_ref = out_gpu = out;
+ #else
+ EIGEN_UNUSED_VARIABLE(in);
+ EIGEN_UNUSED_VARIABLE(out);
+ #endif
+ run_on_cpu (ker, n, in, out_ref);
+ run_on_gpu(ker, n, in, out_gpu);
+ #if !defined(EIGEN_GPU_COMPILE_PHASE)
+ verifyIsApproxWithInfsNans(out_ref, out_gpu);
+ #endif
+}
+
EIGEN_DECLARE_TEST(gpu_basic)
{
ei_test_init_gpu();
int nthreads = 100;
Eigen::VectorXf in, out;
+ Eigen::VectorXcf cfin, cfout;
- #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+ #if !defined(EIGEN_GPU_COMPILE_PHASE)
int data_size = nthreads * 512;
in.setRandom(data_size);
- out.setRandom(data_size);
+ out.setConstant(data_size, -1);
+ cfin.setRandom(data_size);
+ cfout.setConstant(data_size, -1);
#endif
CALL_SUBTEST( run_and_compare_to_gpu(coeff_wise<Vector3f>(), nthreads, in, out) );
@@ -204,6 +297,8 @@ EIGEN_DECLARE_TEST(gpu_basic)
CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix3f>(), nthreads, in, out) );
CALL_SUBTEST( run_and_compare_to_gpu(eigenvalues_direct<Matrix2f>(), nthreads, in, out) );
+ CALL_SUBTEST( test_with_infs_nans(complex_sqrt<Vector3cf>(), nthreads, cfin, cfout) );
+
#if defined(__NVCC__)
// FIXME
// These subtests compile only with nvcc and fail with HIPCC and clang-cuda
diff --git a/test/gpu_common.h b/test/gpu_common.h
index 049e7aade..fe0485e98 100644
--- a/test/gpu_common.h
+++ b/test/gpu_common.h
@@ -68,8 +68,20 @@ void run_on_gpu(const Kernel& ker, int n, const Input& in, Output& out)
#else
run_on_gpu_meta_kernel<<<Grids,Blocks>>>(ker, n, d_in, d_out);
#endif
+ // Pre-launch errors.
+ gpuError_t err = gpuGetLastError();
+ if (err != gpuSuccess) {
+ printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+ gpu_assert(false);
+ }
+
+ // Kernel execution errors.
+ err = gpuDeviceSynchronize();
+ if (err != gpuSuccess) {
+ printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
+ gpu_assert(false);
+ }
- gpuDeviceSynchronize();
// check inputs have not been modified
gpuMemcpy(const_cast<typename Input::Scalar*>(in.data()), d_in, in_bytes, gpuMemcpyDeviceToHost);
@@ -85,7 +97,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
{
Input in_ref, in_gpu;
Output out_ref, out_gpu;
- #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+ #if !defined(EIGEN_GPU_COMPILE_PHASE)
in_ref = in_gpu = in;
out_ref = out_gpu = out;
#else
@@ -94,7 +106,7 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
#endif
run_on_cpu (ker, n, in_ref, out_ref);
run_on_gpu(ker, n, in_gpu, out_gpu);
- #if !defined(__CUDA_ARCH__) && !defined(__HIP_DEVICE_COMPILE__)
+ #if !defined(EIGEN_GPU_COMPILE_PHASE)
VERIFY_IS_APPROX(in_ref, in_gpu);
VERIFY_IS_APPROX(out_ref, out_gpu);
#endif
@@ -102,14 +114,16 @@ void run_and_compare_to_gpu(const Kernel& ker, int n, const Input& in, Output& o
struct compile_time_device_info {
EIGEN_DEVICE_FUNC
- void operator()(int /*i*/, const int* /*in*/, int* info) const
+ void operator()(int i, const int* /*in*/, int* info) const
{
- #if defined(__CUDA_ARCH__)
- info[0] = int(__CUDA_ARCH__ +0);
- #endif
- #if defined(EIGEN_HIP_DEVICE_COMPILE)
- info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
- #endif
+ if (i == 0) {
+ #if defined(__CUDA_ARCH__)
+ info[0] = int(__CUDA_ARCH__ +0);
+ #endif
+ #if defined(EIGEN_HIP_DEVICE_COMPILE)
+ info[1] = int(EIGEN_HIP_DEVICE_COMPILE +0);
+ #endif
+ }
}
};
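The two-step check added to `run_on_gpu` above is the standard CUDA error-handling pattern: `gpuGetLastError` catches launch/configuration errors immediately, while errors raised during kernel execution only surface after a synchronize. A standalone sketch of the same pattern in plain CUDA:

    #include <cstdio>
    #include <cuda_runtime.h>

    __global__ void kernel() {}

    int main() {
      kernel<<<1, 1>>>();

      // 1. Launch errors (bad configuration, missing device, ...) are
      //    available right after the launch.
      cudaError_t err = cudaGetLastError();
      if (err != cudaSuccess)
        std::printf("launch: %s: %s\n", cudaGetErrorName(err), cudaGetErrorString(err));

      // 2. Execution errors (illegal memory access, ...) only appear once
      //    the kernel has actually run.
      err = cudaDeviceSynchronize();
      if (err != cudaSuccess)
        std::printf("exec: %s: %s\n", cudaGetErrorName(err), cudaGetErrorString(err));
    }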
diff --git a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
index f32ce27e9..cb53ce298 100644
--- a/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
+++ b/unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h
@@ -16,7 +16,7 @@
// for some reason gets sent to the gcc/host compiler instead of the gpu/nvcc/hipcc compiler
// When compiling such files, gcc will end up trying to pick up the CUDA headers by
// default (see the code within "unsupported/Eigen/CXX11/Tensor" that is guarded by EIGEN_USE_GPU)
-// This will obsviously not work when trying to compile tensorflow on a system with no CUDA
+// This will obviously not work when trying to compile tensorflow on a system with no CUDA
// To work around this issue for HIP systems (and leave the default behaviour intact), the
// HIP tensorflow build defines EIGEN_USE_HIP when compiling all source files, and
// "unsupported/Eigen/CXX11/Tensor" has been updated to use HIP header when EIGEN_USE_HIP is
@@ -30,6 +30,9 @@
#define gpuSuccess hipSuccess
#define gpuErrorNotReady hipErrorNotReady
#define gpuGetDeviceCount hipGetDeviceCount
+#define gpuGetLastError hipGetLastError
+#define gpuPeekAtLastError hipPeekAtLastError
+#define gpuGetErrorName hipGetErrorName
#define gpuGetErrorString hipGetErrorString
#define gpuGetDeviceProperties hipGetDeviceProperties
#define gpuStreamDefault hipStreamDefault
@@ -57,6 +60,9 @@
#define gpuSuccess cudaSuccess
#define gpuErrorNotReady cudaErrorNotReady
#define gpuGetDeviceCount cudaGetDeviceCount
+#define gpuGetLastError cudaGetLastError
+#define gpuPeekAtLastError cudaPeekAtLastError
+#define gpuGetErrorName cudaGetErrorName
#define gpuGetErrorString cudaGetErrorString
#define gpuGetDeviceProperties cudaGetDeviceProperties
#define gpuStreamDefault cudaStreamDefault
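These aliases keep call sites backend-agnostic; a trimmed standalone sketch of the same technique (only a few of the names above, with the includes the real header relies on):

    #include <cstdio>
    #if defined(EIGEN_USE_HIP)
    #include <hip/hip_runtime.h>
    #define gpuError_t        hipError_t
    #define gpuSuccess        hipSuccess
    #define gpuGetLastError   hipGetLastError
    #define gpuGetErrorName   hipGetErrorName
    #define gpuGetErrorString hipGetErrorString
    #else
    #include <cuda_runtime.h>
    #define gpuError_t        cudaError_t
    #define gpuSuccess        cudaSuccess
    #define gpuGetLastError   cudaGetLastError
    #define gpuGetErrorName   cudaGetErrorName
    #define gpuGetErrorString cudaGetErrorString
    #endif

    int main() {
      // Identical source compiles against either runtime.
      gpuError_t err = gpuGetLastError();
      if (err != gpuSuccess)
        std::printf("%s: %s\n", gpuGetErrorName(err), gpuGetErrorString(err));
    }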