diff options
author | Deven Desai <deven.desai.amd@gmail.com> | 2018-06-20 16:44:58 -0400 |
---|---|---|
committer | Deven Desai <deven.desai.amd@gmail.com> | 2018-06-20 16:44:58 -0400 |
commit | 1bb6fa99a31d2dcf5431087d3f238e2dcca03084 (patch) | |
tree | e62d41b8d6430849aea4bf97785a54488bf542d4 /unsupported/test | |
parent | cfdabbcc8f708c06da2bfa4e924edc25619f013a (diff) |
merging the CUDA and HIP implementation for the Tensor directory and the unit tests
Diffstat (limited to 'unsupported/test')
-rw-r--r-- | unsupported/test/CMakeLists.txt | 49 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_argmax_gpu.cu | 90 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_cast_float16_gpu.cu | 10 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu | 2 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_complex_gpu.cu | 8 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_contract_gpu.cu | 92 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_device.cu | 58 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_gpu.cu | 706 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_of_float16_gpu.cu | 80 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_random_gpu.cu | 29 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_reduction_gpu.cu | 10 | ||||
-rw-r--r-- | unsupported/test/cxx11_tensor_scan_gpu.cu | 25 |
12 files changed, 598 insertions, 561 deletions
diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt index 05b141e21..97e0669a6 100644 --- a/unsupported/test/CMakeLists.txt +++ b/unsupported/test/CMakeLists.txt @@ -274,24 +274,24 @@ if(CUDA_FOUND AND EIGEN_TEST_CUDA) cuda_include_directories("${CMAKE_CURRENT_BINARY_DIR}" "${CUDA_TOOLKIT_ROOT_DIR}/include") set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") - ei_add_test(cxx11_tensor_complex_cuda) - ei_add_test(cxx11_tensor_complex_cwise_ops_cuda) - ei_add_test(cxx11_tensor_reduction_cuda) - ei_add_test(cxx11_tensor_argmax_cuda) - ei_add_test(cxx11_tensor_cast_float16_cuda) - ei_add_test(cxx11_tensor_scan_cuda) + ei_add_test(cxx11_tensor_complex_gpu) + ei_add_test(cxx11_tensor_complex_cwise_ops_gpu) + ei_add_test(cxx11_tensor_reduction_gpu) + ei_add_test(cxx11_tensor_argmax_gpu) + ei_add_test(cxx11_tensor_cast_float16_gpu) + ei_add_test(cxx11_tensor_scan_gpu) # Contractions require arch 3.0 or higher if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 29) ei_add_test(cxx11_tensor_device) - ei_add_test(cxx11_tensor_cuda) - ei_add_test(cxx11_tensor_contract_cuda) - ei_add_test(cxx11_tensor_of_float16_cuda) + ei_add_test(cxx11_tensor_gpu) + ei_add_test(cxx11_tensor_contract_gpu) + ei_add_test(cxx11_tensor_of_float16_gpu) endif() # The random number generation code requires arch 3.5 or greater. if (${EIGEN_CUDA_COMPUTE_ARCH} GREATER 34) - ei_add_test(cxx11_tensor_random_cuda) + ei_add_test(cxx11_tensor_random_gpu) endif() @@ -318,18 +318,23 @@ if (EIGEN_TEST_HIP) include_directories(${HIP_PATH}/include) set(EIGEN_ADD_TEST_FILENAME_EXTENSION "cu") - - # ei_add_test(cxx11_tensor_complex_hip) - # ei_add_test(cxx11_tensor_complex_cwise_ops_hip) - ei_add_test(cxx11_tensor_reduction_hip) - ei_add_test(cxx11_tensor_argmax_hip) - ei_add_test(cxx11_tensor_cast_float16_hip) - ei_add_test(cxx11_tensor_scan_hip) - ei_add_test(cxx11_tensor_device_hip) - ei_add_test(cxx11_tensor_hip) - ei_add_test(cxx11_tensor_contract_hip) - ei_add_test(cxx11_tensor_of_float16_hip) - ei_add_test(cxx11_tensor_random_hip) + # + # complex datatype is not yet supported by HIP + # so leaving out those tests for now + # + # ei_add_test(cxx11_tensor_complex_gpu) + # ei_add_test(cxx11_tensor_complex_cwise_ops_gpu) + # + ei_add_test(cxx11_tensor_reduction_gpu) + ei_add_test(cxx11_tensor_argmax_gpu) + ei_add_test(cxx11_tensor_cast_float16_gpu) + ei_add_test(cxx11_tensor_scan_gpu) + ei_add_test(cxx11_tensor_device) + + ei_add_test(cxx11_tensor_gpu) + ei_add_test(cxx11_tensor_contract_gpu) + ei_add_test(cxx11_tensor_of_float16_gpu) + ei_add_test(cxx11_tensor_random_gpu) unset(EIGEN_ADD_TEST_FILENAME_EXTENSION) diff --git a/unsupported/test/cxx11_tensor_argmax_gpu.cu b/unsupported/test/cxx11_tensor_argmax_gpu.cu index 3d73d491a..541a27865 100644 --- a/unsupported/test/cxx11_tensor_argmax_gpu.cu +++ b/unsupported/test/cxx11_tensor_argmax_gpu.cu @@ -9,16 +9,18 @@ #define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_FUNC cxx11_tensor_cuda +#define EIGEN_TEST_FUNC cxx11_tensor_gpu #define EIGEN_USE_GPU #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> + using Eigen::Tensor; template <int Layout> -void test_cuda_simple_argmax() +void test_gpu_simple_argmax() { Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97)); Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1)); @@ -34,13 +36,13 @@ void test_cuda_simple_argmax() double* d_in; DenseIndex* d_out_max; DenseIndex* d_out_min; - cudaMalloc((void**)(&d_in), in_bytes); - cudaMalloc((void**)(&d_out_max), out_bytes); - cudaMalloc((void**)(&d_out_min), out_bytes); + gpuMalloc((void**)(&d_in), in_bytes); + gpuMalloc((void**)(&d_out_max), out_bytes); + gpuMalloc((void**)(&d_out_min), out_bytes); - cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97)); @@ -50,20 +52,20 @@ void test_cuda_simple_argmax() gpu_out_max.device(gpu_device) = gpu_in.argmax(); gpu_out_min.device(gpu_device) = gpu_in.argmin(); - assert(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out_max.data(), d_out_max, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1); VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0); - cudaFree(d_in); - cudaFree(d_out_max); - cudaFree(d_out_min); + gpuFree(d_in); + gpuFree(d_out_max); + gpuFree(d_out_min); } template <int DataLayout> -void test_cuda_argmax_dim() +void test_gpu_argmax_dim() { Tensor<float, 4, DataLayout> tensor(2,3,5,7); std::vector<int> dims; @@ -97,12 +99,12 @@ void test_cuda_argmax_dim() float* d_in; DenseIndex* d_out; - cudaMalloc((void**)(&d_in), in_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in), in_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7)); @@ -110,8 +112,8 @@ void test_cuda_argmax_dim() gpu_out.device(gpu_device) = gpu_in.argmax(dim); - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); VERIFY_IS_EQUAL(tensor_arg.size(), size_t(2*3*5*7 / tensor.dimension(dim))); @@ -134,25 +136,25 @@ void test_cuda_argmax_dim() } } - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice); gpu_out.device(gpu_device) = gpu_in.argmax(dim); - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } } template <int DataLayout> -void test_cuda_argmin_dim() +void test_gpu_argmin_dim() { Tensor<float, 4, DataLayout> tensor(2,3,5,7); std::vector<int> dims; @@ -186,12 +188,12 @@ void test_cuda_argmin_dim() float* d_in; DenseIndex* d_out; - cudaMalloc((void**)(&d_in), in_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in), in_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7)); @@ -199,8 +201,8 @@ void test_cuda_argmin_dim() gpu_out.device(gpu_device) = gpu_in.argmin(dim); - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); VERIFY_IS_EQUAL(tensor_arg.size(), 2*3*5*7 / tensor.dimension(dim)); @@ -223,29 +225,29 @@ void test_cuda_argmin_dim() } } - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice); gpu_out.device(gpu_device) = gpu_in.argmin(dim); - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } } -void test_cxx11_tensor_cuda() +void test_cxx11_tensor_gpu() { - CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>()); - CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>()); - CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>()); - CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>()); - CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>()); - CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>()); + CALL_SUBTEST_1(test_gpu_simple_argmax<RowMajor>()); + CALL_SUBTEST_1(test_gpu_simple_argmax<ColMajor>()); + CALL_SUBTEST_2(test_gpu_argmax_dim<RowMajor>()); + CALL_SUBTEST_2(test_gpu_argmax_dim<ColMajor>()); + CALL_SUBTEST_3(test_gpu_argmin_dim<RowMajor>()); + CALL_SUBTEST_3(test_gpu_argmin_dim<ColMajor>()); } diff --git a/unsupported/test/cxx11_tensor_cast_float16_gpu.cu b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu index 816e03220..a2928b0b3 100644 --- a/unsupported/test/cxx11_tensor_cast_float16_gpu.cu +++ b/unsupported/test/cxx11_tensor_cast_float16_gpu.cu @@ -9,7 +9,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_cuda +#define EIGEN_TEST_FUNC cxx11_tensor_cast_float16_gpu #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU @@ -18,8 +18,8 @@ using Eigen::Tensor; -void test_cuda_conversion() { - Eigen::CudaStreamDevice stream; +void test_gpu_conversion() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -72,8 +72,8 @@ void test_fallback_conversion() { } -void test_cxx11_tensor_cast_float16_cuda() +void test_cxx11_tensor_cast_float16_gpu() { - CALL_SUBTEST(test_cuda_conversion()); + CALL_SUBTEST(test_gpu_conversion()); CALL_SUBTEST(test_fallback_conversion()); } diff --git a/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu index aac780905..af67348aa 100644 --- a/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu +++ b/unsupported/test/cxx11_tensor_complex_cwise_ops_gpu.cu @@ -28,7 +28,7 @@ void test_cuda_complex_cwise_ops() { cudaMalloc((void**)(&d_in2), complex_bytes); cudaMalloc((void**)(&d_out), complex_bytes); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<std::complex<T>, 1, 0, int>, Eigen::Aligned> gpu_in1( diff --git a/unsupported/test/cxx11_tensor_complex_gpu.cu b/unsupported/test/cxx11_tensor_complex_gpu.cu index a52350f85..45b49d266 100644 --- a/unsupported/test/cxx11_tensor_complex_gpu.cu +++ b/unsupported/test/cxx11_tensor_complex_gpu.cu @@ -34,7 +34,7 @@ void test_cuda_nullary() { cudaMemcpy(d_in1, in1.data(), complex_bytes, cudaMemcpyHostToDevice); cudaMemcpy(d_in2, in2.data(), complex_bytes, cudaMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<std::complex<float>, 1, 0, int>, Eigen::Aligned> gpu_in1( @@ -70,7 +70,7 @@ void test_cuda_nullary() { static void test_cuda_sum_reductions() { - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); const int num_rows = internal::random<int>(1024, 5*1024); @@ -106,7 +106,7 @@ static void test_cuda_sum_reductions() { static void test_cuda_mean_reductions() { - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); const int num_rows = internal::random<int>(1024, 5*1024); @@ -142,7 +142,7 @@ static void test_cuda_mean_reductions() { static void test_cuda_product_reductions() { - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); const int num_rows = internal::random<int>(1024, 5*1024); diff --git a/unsupported/test/cxx11_tensor_contract_gpu.cu b/unsupported/test/cxx11_tensor_contract_gpu.cu index 3621e2aa6..061d0464e 100644 --- a/unsupported/test/cxx11_tensor_contract_gpu.cu +++ b/unsupported/test/cxx11_tensor_contract_gpu.cu @@ -10,19 +10,20 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_cuda +#define EIGEN_TEST_FUNC cxx11_tensor_gpu #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> using Eigen::Tensor; typedef Tensor<float, 1>::DimensionPair DimPair; template<int DataLayout> -void test_cuda_contraction(int m_size, int k_size, int n_size) +void test_gpu_contraction(int m_size, int k_size, int n_size) { std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is @@ -45,14 +46,14 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) float* d_t_right; float* d_t_result; - cudaMalloc((void**)(&d_t_left), t_left_bytes); - cudaMalloc((void**)(&d_t_right), t_right_bytes); - cudaMalloc((void**)(&d_t_result), t_result_bytes); + gpuMalloc((void**)(&d_t_left), t_left_bytes); + gpuMalloc((void**)(&d_t_right), t_right_bytes); + gpuMalloc((void**)(&d_t_result), t_result_bytes); - cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > @@ -66,7 +67,7 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); t_result = t_left.contract(t_right, dims); - cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); for (DenseIndex i = 0; i < t_result.size(); i++) { if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { continue; @@ -79,9 +80,9 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) assert(false); } - cudaFree((void*)d_t_left); - cudaFree((void*)d_t_right); - cudaFree((void*)d_t_result); + gpuFree((void*)d_t_left); + gpuFree((void*)d_t_right); + gpuFree((void*)d_t_result); } @@ -109,14 +110,14 @@ void test_scalar(int m_size, int k_size, int n_size) float* d_t_right; float* d_t_result; - cudaMalloc((void**)(&d_t_left), t_left_bytes); - cudaMalloc((void**)(&d_t_right), t_right_bytes); - cudaMalloc((void**)(&d_t_result), t_result_bytes); + gpuMalloc((void**)(&d_t_left), t_left_bytes); + gpuMalloc((void**)(&d_t_right), t_right_bytes); + gpuMalloc((void**)(&d_t_result), t_result_bytes); - cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> > @@ -129,7 +130,7 @@ void test_scalar(int m_size, int k_size, int n_size) gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); t_result = t_left.contract(t_right, dims); - cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); if (fabs(t_result() - t_result_gpu()) > 1e-4f && !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) { std::cout << "mismatch detected: " << t_result() @@ -137,39 +138,39 @@ void test_scalar(int m_size, int k_size, int n_size) assert(false); } - cudaFree((void*)d_t_left); - cudaFree((void*)d_t_right); - cudaFree((void*)d_t_result); + gpuFree((void*)d_t_left); + gpuFree((void*)d_t_right); + gpuFree((void*)d_t_result); } template<int DataLayout> -void test_cuda_contraction_m() { +void test_gpu_contraction_m() { for (int k = 32; k < 256; k++) { - test_cuda_contraction<ColMajor>(k, 128, 128); - test_cuda_contraction<RowMajor>(k, 128, 128); + test_gpu_contraction<ColMajor>(k, 128, 128); + test_gpu_contraction<RowMajor>(k, 128, 128); } } template<int DataLayout> -void test_cuda_contraction_k() { +void test_gpu_contraction_k() { for (int k = 32; k < 256; k++) { - test_cuda_contraction<ColMajor>(128, k, 128); - test_cuda_contraction<RowMajor>(128, k, 128); + test_gpu_contraction<ColMajor>(128, k, 128); + test_gpu_contraction<RowMajor>(128, k, 128); } } template<int DataLayout> -void test_cuda_contraction_n() { +void test_gpu_contraction_n() { for (int k = 32; k < 256; k++) { - test_cuda_contraction<ColMajor>(128, 128, k); - test_cuda_contraction<RowMajor>(128, 128, k); + test_gpu_contraction<ColMajor>(128, 128, k); + test_gpu_contraction<RowMajor>(128, 128, k); } } template<int DataLayout> -void test_cuda_contraction_sizes() { +void test_gpu_contraction_sizes() { int m_sizes[] = { 31, 39, 63, 64, 65, 127, 129, 255, 257 , 511, 512, 513, 1023, 1024, 1025}; @@ -186,29 +187,32 @@ void test_cuda_contraction_sizes() { for (int i = 0; i < 15; i++) { for (int j = 0; j < 15; j++) { for (int k = 0; k < 17; k++) { - test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]); + test_gpu_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]); } } } } -void test_cxx11_tensor_cuda() +void test_cxx11_tensor_gpu() { - CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128)); - CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128)); + CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128)); + CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128)); CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128)); CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128)); - CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>()); - CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>()); + CALL_SUBTEST_2(test_gpu_contraction_m<ColMajor>()); + CALL_SUBTEST_3(test_gpu_contraction_m<RowMajor>()); - CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>()); - CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>()); + CALL_SUBTEST_4(test_gpu_contraction_k<ColMajor>()); + CALL_SUBTEST_5(test_gpu_contraction_k<RowMajor>()); - CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>()); - CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>()); + CALL_SUBTEST_6(test_gpu_contraction_n<ColMajor>()); + CALL_SUBTEST_7(test_gpu_contraction_n<RowMajor>()); - CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>()); - CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>()); +#if !defined(EIGEN_USE_HIP) +// disable these subtests for HIP + CALL_SUBTEST_8(test_gpu_contraction_sizes<ColMajor>()); + CALL_SUBTEST_9(test_gpu_contraction_sizes<RowMajor>()); +#endif } diff --git a/unsupported/test/cxx11_tensor_device.cu b/unsupported/test/cxx11_tensor_device.cu index 7c14bc187..52215fc39 100644 --- a/unsupported/test/cxx11_tensor_device.cu +++ b/unsupported/test/cxx11_tensor_device.cu @@ -16,6 +16,7 @@ #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> using Eigen::Tensor; using Eigen::RowMajor; @@ -66,22 +67,22 @@ struct CPUContext { // Context for evaluation on GPU struct GPUContext { GPUContext(const Eigen::TensorMap<Eigen::Tensor<float, 3> >& in1, Eigen::TensorMap<Eigen::Tensor<float, 3> >& in2, Eigen::TensorMap<Eigen::Tensor<float, 3> >& out) : in1_(in1), in2_(in2), out_(out), gpu_device_(&stream_) { - assert(cudaMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == cudaSuccess); + assert(gpuMalloc((void**)(&kernel_1d_), 2*sizeof(float)) == gpuSuccess); float kernel_1d_val[] = {3.14f, 2.7f}; - assert(cudaMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + assert(gpuMemcpy(kernel_1d_, kernel_1d_val, 2*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess); - assert(cudaMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == cudaSuccess); + assert(gpuMalloc((void**)(&kernel_2d_), 4*sizeof(float)) == gpuSuccess); float kernel_2d_val[] = {3.14f, 2.7f, 0.2f, 7.0f}; - assert(cudaMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + assert(gpuMemcpy(kernel_2d_, kernel_2d_val, 4*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess); - assert(cudaMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == cudaSuccess); + assert(gpuMalloc((void**)(&kernel_3d_), 8*sizeof(float)) == gpuSuccess); float kernel_3d_val[] = {3.14f, -1.0f, 2.7f, -0.3f, 0.2f, -0.7f, 7.0f, -0.5f}; - assert(cudaMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), cudaMemcpyHostToDevice) == cudaSuccess); + assert(gpuMemcpy(kernel_3d_, kernel_3d_val, 8*sizeof(float), gpuMemcpyHostToDevice) == gpuSuccess); } ~GPUContext() { - assert(cudaFree(kernel_1d_) == cudaSuccess); - assert(cudaFree(kernel_2d_) == cudaSuccess); - assert(cudaFree(kernel_3d_) == cudaSuccess); + assert(gpuFree(kernel_1d_) == gpuSuccess); + assert(gpuFree(kernel_2d_) == gpuSuccess); + assert(gpuFree(kernel_3d_) == gpuSuccess); } const Eigen::GpuDevice& device() const { return gpu_device_; } @@ -102,7 +103,7 @@ struct GPUContext { float* kernel_2d_; float* kernel_3d_; - Eigen::CudaStreamDevice stream_; + Eigen::GpuStreamDevice stream_; Eigen::GpuDevice gpu_device_; }; @@ -281,12 +282,12 @@ void test_gpu() { float* d_in1; float* d_in2; float* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_in2), in2_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in1), in1_bytes); + gpuMalloc((void**)(&d_in2), in2_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice); Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, 40,50,70); Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in2(d_in2, 40,50,70); @@ -294,7 +295,7 @@ void test_gpu() { GPUContext context(gpu_in1, gpu_in2, gpu_out); test_contextual_eval(&context); - assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { @@ -304,7 +305,7 @@ void test_gpu() { } test_forced_contextual_eval(&context); - assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { @@ -314,7 +315,7 @@ void test_gpu() { } test_compound_assignment(&context); - assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 50; ++j) { for (int k = 0; k < 70; ++k) { @@ -324,7 +325,7 @@ void test_gpu() { } test_contraction(&context); - assert(cudaMemcpy(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost) == cudaSuccess); + assert(gpuMemcpy(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 40; ++j) { const float result = out(i,j,0); @@ -339,8 +340,8 @@ void test_gpu() { } test_1d_convolution(&context); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); - assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess); + assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 70; ++k) { @@ -350,8 +351,8 @@ void test_gpu() { } test_2d_convolution(&context); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); - assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess); + assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess); for (int i = 0; i < 40; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 69; ++k) { @@ -363,9 +364,13 @@ void test_gpu() { } } +#if !defined(EIGEN_USE_HIP) +// disable this test on the HIP platform +// 3D tensor convolutions seem to hang on the HIP platform + test_3d_convolution(&context); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, context.device().stream()) == cudaSuccess); - assert(cudaStreamSynchronize(context.device().stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, context.device().stream()) == gpuSuccess); + assert(gpuStreamSynchronize(context.device().stream()) == gpuSuccess); for (int i = 0; i < 39; ++i) { for (int j = 0; j < 49; ++j) { for (int k = 0; k < 69; ++k) { @@ -378,6 +383,9 @@ void test_gpu() { } } } + +#endif + } diff --git a/unsupported/test/cxx11_tensor_gpu.cu b/unsupported/test/cxx11_tensor_gpu.cu index f238ed5be..285441182 100644 --- a/unsupported/test/cxx11_tensor_gpu.cu +++ b/unsupported/test/cxx11_tensor_gpu.cu @@ -9,15 +9,17 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_cuda +#define EIGEN_TEST_FUNC cxx11_tensor_gpu #define EIGEN_USE_GPU #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> + using Eigen::Tensor; -void test_cuda_nullary() { +void test_gpu_nullary() { Tensor<float, 1, 0, int> in1(2); Tensor<float, 1, 0, int> in2(2); in1.setRandom(); @@ -27,12 +29,12 @@ void test_cuda_nullary() { float* d_in1; float* d_in2; - cudaMalloc((void**)(&d_in1), tensor_bytes); - cudaMalloc((void**)(&d_in2), tensor_bytes); - cudaMemcpy(d_in1, in1.data(), tensor_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in2, in2.data(), tensor_bytes, cudaMemcpyHostToDevice); + gpuMalloc((void**)(&d_in1), tensor_bytes); + gpuMalloc((void**)(&d_in2), tensor_bytes); + gpuMemcpy(d_in1, in1.data(), tensor_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in2, in2.data(), tensor_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 1, 0, int>, Eigen::Aligned> gpu_in1( @@ -46,23 +48,23 @@ void test_cuda_nullary() { Tensor<float, 1, 0, int> new1(2); Tensor<float, 1, 0, int> new2(2); - assert(cudaMemcpyAsync(new1.data(), d_in1, tensor_bytes, cudaMemcpyDeviceToHost, - gpu_device.stream()) == cudaSuccess); - assert(cudaMemcpyAsync(new2.data(), d_in2, tensor_bytes, cudaMemcpyDeviceToHost, - gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(new1.data(), d_in1, tensor_bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuMemcpyAsync(new2.data(), d_in2, tensor_bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 2; ++i) { VERIFY_IS_APPROX(new1(i), 3.14f); VERIFY_IS_NOT_EQUAL(new2(i), in2(i)); } - cudaFree(d_in1); - cudaFree(d_in2); + gpuFree(d_in1); + gpuFree(d_in2); } -void test_cuda_elementwise_small() { +void test_gpu_elementwise_small() { Tensor<float, 1> in1(Eigen::array<Eigen::DenseIndex, 1>(2)); Tensor<float, 1> in2(Eigen::array<Eigen::DenseIndex, 1>(2)); Tensor<float, 1> out(Eigen::array<Eigen::DenseIndex, 1>(2)); @@ -76,14 +78,14 @@ void test_cuda_elementwise_small() { float* d_in1; float* d_in2; float* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_in2), in2_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in1), in1_bytes); + gpuMalloc((void**)(&d_in2), in2_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( @@ -95,9 +97,9 @@ void test_cuda_elementwise_small() { gpu_out.device(gpu_device) = gpu_in1 + gpu_in2; - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, - gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 2; ++i) { VERIFY_IS_APPROX( @@ -105,12 +107,12 @@ void test_cuda_elementwise_small() { in1(Eigen::array<Eigen::DenseIndex, 1>(i)) + in2(Eigen::array<Eigen::DenseIndex, 1>(i))); } - cudaFree(d_in1); - cudaFree(d_in2); - cudaFree(d_out); + gpuFree(d_in1); + gpuFree(d_in2); + gpuFree(d_out); } -void test_cuda_elementwise() +void test_gpu_elementwise() { Tensor<float, 3> in1(Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); Tensor<float, 3> in2(Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); @@ -129,16 +131,16 @@ void test_cuda_elementwise() float* d_in2; float* d_in3; float* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_in2), in2_bytes); - cudaMalloc((void**)(&d_in3), in3_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in1), in1_bytes); + gpuMalloc((void**)(&d_in2), in2_bytes); + gpuMalloc((void**)(&d_in3), in3_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in2, in2.data(), in2_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in3, in3.data(), in3_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in2, in2.data(), in2_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in3, in3.data(), in3_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 3> > gpu_in1(d_in1, Eigen::array<Eigen::DenseIndex, 3>(72,53,97)); @@ -148,8 +150,8 @@ void test_cuda_elementwise() gpu_out.device(gpu_device) = gpu_in1 + gpu_in2 * gpu_in3; - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 72; ++i) { for (int j = 0; j < 53; ++j) { @@ -159,13 +161,13 @@ void test_cuda_elementwise() } } - cudaFree(d_in1); - cudaFree(d_in2); - cudaFree(d_in3); - cudaFree(d_out); + gpuFree(d_in1); + gpuFree(d_in2); + gpuFree(d_in3); + gpuFree(d_out); } -void test_cuda_props() { +void test_gpu_props() { Tensor<float, 1> in1(200); Tensor<bool, 1> out(200); in1.setRandom(); @@ -175,12 +177,12 @@ void test_cuda_props() { float* d_in1; bool* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in1), in1_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 1>, Eigen::Aligned> gpu_in1( @@ -190,19 +192,19 @@ void test_cuda_props() { gpu_out.device(gpu_device) = (gpu_in1.isnan)(); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, - gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 200; ++i) { VERIFY_IS_EQUAL(out(i), (std::isnan)(in1(i))); } - cudaFree(d_in1); - cudaFree(d_out); + gpuFree(d_in1); + gpuFree(d_out); } -void test_cuda_reduction() +void test_gpu_reduction() { Tensor<float, 4> in1(72,53,97,113); Tensor<float, 2> out(72,97); @@ -213,12 +215,12 @@ void test_cuda_reduction() float* d_in1; float* d_out; - cudaMalloc((void**)(&d_in1), in1_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in1), in1_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in1, in1.data(), in1_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in1, in1.data(), in1_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4> > gpu_in1(d_in1, 72,53,97,113); @@ -230,8 +232,8 @@ void test_cuda_reduction() gpu_out.device(gpu_device) = gpu_in1.maximum(reduction_axis); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 72; ++i) { for (int j = 0; j < 97; ++j) { @@ -246,12 +248,12 @@ void test_cuda_reduction() } } - cudaFree(d_in1); - cudaFree(d_out); + gpuFree(d_in1); + gpuFree(d_out); } template<int DataLayout> -void test_cuda_contraction() +void test_gpu_contraction() { // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on @@ -271,14 +273,14 @@ void test_cuda_contraction() float* d_t_right; float* d_t_result; - cudaMalloc((void**)(&d_t_left), t_left_bytes); - cudaMalloc((void**)(&d_t_right), t_right_bytes); - cudaMalloc((void**)(&d_t_result), t_result_bytes); + gpuMalloc((void**)(&d_t_left), t_left_bytes); + gpuMalloc((void**)(&d_t_right), t_right_bytes); + gpuMalloc((void**)(&d_t_result), t_result_bytes); - cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_t_left(d_t_left, 6, 50, 3, 31); @@ -298,7 +300,7 @@ void test_cuda_contraction() m_result = m_left * m_right; gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); - cudaMemcpy(t_result.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + gpuMemcpy(t_result.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); for (DenseIndex i = 0; i < t_result.size(); i++) { if (fabs(t_result.data()[i] - m_result.data()[i]) >= 1e-4f) { @@ -307,13 +309,13 @@ void test_cuda_contraction() } } - cudaFree(d_t_left); - cudaFree(d_t_right); - cudaFree(d_t_result); + gpuFree(d_t_left); + gpuFree(d_t_right); + gpuFree(d_t_result); } template<int DataLayout> -void test_cuda_convolution_1d() +void test_gpu_convolution_1d() { Tensor<float, 4, DataLayout> input(74,37,11,137); Tensor<float, 1, DataLayout> kernel(4); @@ -328,14 +330,14 @@ void test_cuda_convolution_1d() float* d_input; float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_input), input_bytes); + gpuMalloc((void**)(&d_kernel), kernel_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input, 74,37,11,137); @@ -345,8 +347,8 @@ void test_cuda_convolution_1d() Eigen::array<Eigen::DenseIndex, 1> dims(1); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 74; ++i) { for (int j = 0; j < 34; ++j) { @@ -361,12 +363,12 @@ void test_cuda_convolution_1d() } } - cudaFree(d_input); - cudaFree(d_kernel); - cudaFree(d_out); + gpuFree(d_input); + gpuFree(d_kernel); + gpuFree(d_out); } -void test_cuda_convolution_inner_dim_col_major_1d() +void test_gpu_convolution_inner_dim_col_major_1d() { Tensor<float, 4, ColMajor> input(74,9,11,7); Tensor<float, 1, ColMajor> kernel(4); @@ -381,14 +383,14 @@ void test_cuda_convolution_inner_dim_col_major_1d() float* d_input; float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_input), input_bytes); + gpuMalloc((void**)(&d_kernel), kernel_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, ColMajor> > gpu_input(d_input,74,9,11,7); @@ -398,8 +400,8 @@ void test_cuda_convolution_inner_dim_col_major_1d() Eigen::array<Eigen::DenseIndex, 1> dims(0); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 71; ++i) { for (int j = 0; j < 9; ++j) { @@ -414,12 +416,12 @@ void test_cuda_convolution_inner_dim_col_major_1d() } } - cudaFree(d_input); - cudaFree(d_kernel); - cudaFree(d_out); + gpuFree(d_input); + gpuFree(d_kernel); + gpuFree(d_out); } -void test_cuda_convolution_inner_dim_row_major_1d() +void test_gpu_convolution_inner_dim_row_major_1d() { Tensor<float, 4, RowMajor> input(7,9,11,74); Tensor<float, 1, RowMajor> kernel(4); @@ -434,14 +436,14 @@ void test_cuda_convolution_inner_dim_row_major_1d() float* d_input; float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_input), input_bytes); + gpuMalloc((void**)(&d_kernel), kernel_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, RowMajor> > gpu_input(d_input, 7,9,11,74); @@ -451,8 +453,8 @@ void test_cuda_convolution_inner_dim_row_major_1d() Eigen::array<Eigen::DenseIndex, 1> dims(3); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 7; ++i) { for (int j = 0; j < 9; ++j) { @@ -467,13 +469,13 @@ void test_cuda_convolution_inner_dim_row_major_1d() } } - cudaFree(d_input); - cudaFree(d_kernel); - cudaFree(d_out); + gpuFree(d_input); + gpuFree(d_kernel); + gpuFree(d_out); } template<int DataLayout> -void test_cuda_convolution_2d() +void test_gpu_convolution_2d() { Tensor<float, 4, DataLayout> input(74,37,11,137); Tensor<float, 2, DataLayout> kernel(3,4); @@ -488,14 +490,14 @@ void test_cuda_convolution_2d() float* d_input; float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_input), input_bytes); + gpuMalloc((void**)(&d_kernel), kernel_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout> > gpu_input(d_input,74,37,11,137); @@ -505,8 +507,8 @@ void test_cuda_convolution_2d() Eigen::array<Eigen::DenseIndex, 2> dims(1,2); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 74; ++i) { for (int j = 0; j < 35; ++j) { @@ -531,13 +533,13 @@ void test_cuda_convolution_2d() } } - cudaFree(d_input); - cudaFree(d_kernel); - cudaFree(d_out); + gpuFree(d_input); + gpuFree(d_kernel); + gpuFree(d_out); } template<int DataLayout> -void test_cuda_convolution_3d() +void test_gpu_convolution_3d() { Tensor<float, 5, DataLayout> input(Eigen::array<Eigen::DenseIndex, 5>(74,37,11,137,17)); Tensor<float, 3, DataLayout> kernel(3,4,2); @@ -552,14 +554,14 @@ void test_cuda_convolution_3d() float* d_input; float* d_kernel; float* d_out; - cudaMalloc((void**)(&d_input), input_bytes); - cudaMalloc((void**)(&d_kernel), kernel_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_input), input_bytes); + gpuMalloc((void**)(&d_kernel), kernel_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_input, input.data(), input_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_kernel, kernel.data(), kernel_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_input, input.data(), input_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_kernel, kernel.data(), kernel_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 5, DataLayout> > gpu_input(d_input,74,37,11,137,17); @@ -569,8 +571,8 @@ void test_cuda_convolution_3d() Eigen::array<Eigen::DenseIndex, 3> dims(1,2,3); gpu_out.device(gpu_device) = gpu_input.convolve(gpu_kernel, dims); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 74; ++i) { for (int j = 0; j < 35; ++j) { @@ -609,14 +611,14 @@ void test_cuda_convolution_3d() } } - cudaFree(d_input); - cudaFree(d_kernel); - cudaFree(d_out); + gpuFree(d_input); + gpuFree(d_kernel); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_lgamma(const Scalar stddev) +void test_gpu_lgamma(const Scalar stddev) { Tensor<Scalar, 2> in(72,97); in.setRandom(); @@ -628,12 +630,12 @@ void test_cuda_lgamma(const Scalar stddev) Scalar* d_in; Scalar* d_out; - cudaMalloc((void**)(&d_in), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); @@ -641,8 +643,8 @@ void test_cuda_lgamma(const Scalar stddev) gpu_out.device(gpu_device) = gpu_in.lgamma(); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 72; ++i) { for (int j = 0; j < 97; ++j) { @@ -650,12 +652,12 @@ void test_cuda_lgamma(const Scalar stddev) } } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_digamma() +void test_gpu_digamma() { Tensor<Scalar, 1> in(7); Tensor<Scalar, 1> out(7); @@ -682,12 +684,12 @@ void test_cuda_digamma() Scalar* d_in; Scalar* d_out; - cudaMalloc((void**)(&d_in), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 7); @@ -695,8 +697,8 @@ void test_cuda_digamma() gpu_out.device(gpu_device) = gpu_in.digamma(); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 5; ++i) { VERIFY_IS_APPROX(out(i), expected_out(i)); @@ -705,12 +707,12 @@ void test_cuda_digamma() VERIFY_IS_EQUAL(out(i), expected_out(i)); } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_zeta() +void test_gpu_zeta() { Tensor<Scalar, 1> in_x(6); Tensor<Scalar, 1> in_q(6); @@ -744,14 +746,14 @@ void test_cuda_zeta() Scalar* d_in_x; Scalar* d_in_q; Scalar* d_out; - cudaMalloc((void**)(&d_in_x), bytes); - cudaMalloc((void**)(&d_in_q), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in_x), bytes); + gpuMalloc((void**)(&d_in_q), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in_q, in_q.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in_q, in_q.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 6); @@ -760,8 +762,8 @@ void test_cuda_zeta() gpu_out.device(gpu_device) = gpu_in_x.zeta(gpu_in_q); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); VERIFY_IS_EQUAL(out(0), expected_out(0)); VERIFY((std::isnan)(out(3))); @@ -772,13 +774,13 @@ void test_cuda_zeta() } } - cudaFree(d_in_x); - cudaFree(d_in_q); - cudaFree(d_out); + gpuFree(d_in_x); + gpuFree(d_in_q); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_polygamma() +void test_gpu_polygamma() { Tensor<Scalar, 1> in_x(7); Tensor<Scalar, 1> in_n(7); @@ -815,14 +817,14 @@ void test_cuda_polygamma() Scalar* d_in_x; Scalar* d_in_n; Scalar* d_out; - cudaMalloc((void**)(&d_in_x), bytes); - cudaMalloc((void**)(&d_in_n), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in_x), bytes); + gpuMalloc((void**)(&d_in_n), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in_n, in_n.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in_n, in_n.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 7); @@ -831,20 +833,20 @@ void test_cuda_polygamma() gpu_out.device(gpu_device) = gpu_in_n.polygamma(gpu_in_x); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 7; ++i) { VERIFY_IS_APPROX(out(i), expected_out(i)); } - cudaFree(d_in_x); - cudaFree(d_in_n); - cudaFree(d_out); + gpuFree(d_in_x); + gpuFree(d_in_n); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_igamma() +void test_gpu_igamma() { Tensor<Scalar, 2> a(6, 6); Tensor<Scalar, 2> x(6, 6); @@ -880,14 +882,14 @@ void test_cuda_igamma() Scalar* d_a; Scalar* d_x; Scalar* d_out; - assert(cudaMalloc((void**)(&d_a), bytes) == cudaSuccess); - assert(cudaMalloc((void**)(&d_x), bytes) == cudaSuccess); - assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess); + assert(gpuMalloc((void**)(&d_a), bytes) == gpuSuccess); + assert(gpuMalloc((void**)(&d_x), bytes) == gpuSuccess); + assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess); - cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6); @@ -896,8 +898,8 @@ void test_cuda_igamma() gpu_out.device(gpu_device) = gpu_a.igamma(gpu_x); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { @@ -909,13 +911,13 @@ void test_cuda_igamma() } } - cudaFree(d_a); - cudaFree(d_x); - cudaFree(d_out); + gpuFree(d_a); + gpuFree(d_x); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_igammac() +void test_gpu_igammac() { Tensor<Scalar, 2> a(6, 6); Tensor<Scalar, 2> x(6, 6); @@ -950,14 +952,14 @@ void test_cuda_igammac() Scalar* d_a; Scalar* d_x; Scalar* d_out; - cudaMalloc((void**)(&d_a), bytes); - cudaMalloc((void**)(&d_x), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_a), bytes); + gpuMalloc((void**)(&d_x), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_a, a.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_x, x.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_a, a.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_x, x.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_a(d_a, 6, 6); @@ -966,8 +968,8 @@ void test_cuda_igammac() gpu_out.device(gpu_device) = gpu_a.igammac(gpu_x); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 6; ++i) { for (int j = 0; j < 6; ++j) { @@ -979,13 +981,13 @@ void test_cuda_igammac() } } - cudaFree(d_a); - cudaFree(d_x); - cudaFree(d_out); + gpuFree(d_a); + gpuFree(d_x); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_erf(const Scalar stddev) +void test_gpu_erf(const Scalar stddev) { Tensor<Scalar, 2> in(72,97); in.setRandom(); @@ -997,12 +999,12 @@ void test_cuda_erf(const Scalar stddev) Scalar* d_in; Scalar* d_out; - assert(cudaMalloc((void**)(&d_in), bytes) == cudaSuccess); - assert(cudaMalloc((void**)(&d_out), bytes) == cudaSuccess); + assert(gpuMalloc((void**)(&d_in), bytes) == gpuSuccess); + assert(gpuMalloc((void**)(&d_out), bytes) == gpuSuccess); - cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); @@ -1010,8 +1012,8 @@ void test_cuda_erf(const Scalar stddev) gpu_out.device(gpu_device) = gpu_in.erf(); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 72; ++i) { for (int j = 0; j < 97; ++j) { @@ -1019,12 +1021,12 @@ void test_cuda_erf(const Scalar stddev) } } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_erfc(const Scalar stddev) +void test_gpu_erfc(const Scalar stddev) { Tensor<Scalar, 2> in(72,97); in.setRandom(); @@ -1036,12 +1038,12 @@ void test_cuda_erfc(const Scalar stddev) Scalar* d_in; Scalar* d_out; - cudaMalloc((void**)(&d_in), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in, in.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, in.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 2> > gpu_in(d_in, 72, 97); @@ -1049,8 +1051,8 @@ void test_cuda_erfc(const Scalar stddev) gpu_out.device(gpu_device) = gpu_in.erfc(); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 72; ++i) { for (int j = 0; j < 97; ++j) { @@ -1058,12 +1060,12 @@ void test_cuda_erfc(const Scalar stddev) } } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_betainc() +void test_gpu_betainc() { Tensor<Scalar, 1> in_x(125); Tensor<Scalar, 1> in_a(125); @@ -1172,16 +1174,16 @@ void test_cuda_betainc() Scalar* d_in_a; Scalar* d_in_b; Scalar* d_out; - cudaMalloc((void**)(&d_in_x), bytes); - cudaMalloc((void**)(&d_in_a), bytes); - cudaMalloc((void**)(&d_in_b), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in_x), bytes); + gpuMalloc((void**)(&d_in_a), bytes); + gpuMalloc((void**)(&d_in_b), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in_x, in_x.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in_a, in_a.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_in_b, in_b.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in_x, in_x.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in_a, in_a.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_in_b, in_b.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in_x(d_in_x, 125); @@ -1191,8 +1193,8 @@ void test_cuda_betainc() gpu_out.device(gpu_device) = betainc(gpu_in_a, gpu_in_b, gpu_in_x); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 1; i < 125; ++i) { if ((std::isnan)(expected_out(i))) { @@ -1202,14 +1204,14 @@ void test_cuda_betainc() } } - cudaFree(d_in_x); - cudaFree(d_in_a); - cudaFree(d_in_b); - cudaFree(d_out); + gpuFree(d_in_x); + gpuFree(d_in_a); + gpuFree(d_in_b); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_i0e() +void test_gpu_i0e() { Tensor<Scalar, 1> in_x(21); Tensor<Scalar, 1> out(21); @@ -1238,12 +1240,12 @@ void test_cuda_i0e() Scalar* d_in; Scalar* d_out; - cudaMalloc((void**)(&d_in), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in, in_x.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21); @@ -1251,20 +1253,20 @@ void test_cuda_i0e() gpu_out.device(gpu_device) = gpu_in.i0e(); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, - gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 21; ++i) { VERIFY_IS_APPROX(out(i), expected_out(i)); } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_i1e() +void test_gpu_i1e() { Tensor<Scalar, 1> in_x(21); Tensor<Scalar, 1> out(21); @@ -1293,12 +1295,12 @@ void test_cuda_i1e() Scalar* d_in; Scalar* d_out; - cudaMalloc((void**)(&d_in), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_in), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_in, in_x.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, in_x.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_in(d_in, 21); @@ -1306,20 +1308,20 @@ void test_cuda_i1e() gpu_out.device(gpu_device) = gpu_in.i1e(); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, - gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 21; ++i) { VERIFY_IS_APPROX(out(i), expected_out(i)); } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_igamma_der_a() +void test_gpu_igamma_der_a() { Tensor<Scalar, 1> in_x(30); Tensor<Scalar, 1> in_a(30); @@ -1365,14 +1367,14 @@ void test_cuda_igamma_der_a() Scalar* d_a; Scalar* d_x; Scalar* d_out; - cudaMalloc((void**)(&d_a), bytes); - cudaMalloc((void**)(&d_x), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_a), bytes); + gpuMalloc((void**)(&d_x), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_a, in_a.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_x, in_x.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_a, in_a.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_x, in_x.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_a(d_a, 30); @@ -1381,21 +1383,21 @@ void test_cuda_igamma_der_a() gpu_out.device(gpu_device) = gpu_a.igamma_der_a(gpu_x); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, - gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 30; ++i) { VERIFY_IS_APPROX(out(i), expected_out(i)); } - cudaFree(d_a); - cudaFree(d_x); - cudaFree(d_out); + gpuFree(d_a); + gpuFree(d_x); + gpuFree(d_out); } template <typename Scalar> -void test_cuda_gamma_sample_der_alpha() +void test_gpu_gamma_sample_der_alpha() { Tensor<Scalar, 1> in_alpha(30); Tensor<Scalar, 1> in_sample(30); @@ -1441,14 +1443,14 @@ void test_cuda_gamma_sample_der_alpha() Scalar* d_alpha; Scalar* d_sample; Scalar* d_out; - cudaMalloc((void**)(&d_alpha), bytes); - cudaMalloc((void**)(&d_sample), bytes); - cudaMalloc((void**)(&d_out), bytes); + gpuMalloc((void**)(&d_alpha), bytes); + gpuMalloc((void**)(&d_sample), bytes); + gpuMalloc((void**)(&d_out), bytes); - cudaMemcpy(d_alpha, in_alpha.data(), bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_sample, in_sample.data(), bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_alpha, in_alpha.data(), bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_sample, in_sample.data(), bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<Scalar, 1> > gpu_alpha(d_alpha, 30); @@ -1457,101 +1459,115 @@ void test_cuda_gamma_sample_der_alpha() gpu_out.device(gpu_device) = gpu_alpha.gamma_sample_der_alpha(gpu_sample); - assert(cudaMemcpyAsync(out.data(), d_out, bytes, cudaMemcpyDeviceToHost, - gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, bytes, gpuMemcpyDeviceToHost, + gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (int i = 0; i < 30; ++i) { VERIFY_IS_APPROX(out(i), expected_out(i)); } - cudaFree(d_alpha); - cudaFree(d_sample); - cudaFree(d_out); + gpuFree(d_alpha); + gpuFree(d_sample); + gpuFree(d_out); } -void test_cxx11_tensor_cuda() +void test_cxx11_tensor_gpu() { - CALL_SUBTEST_1(test_cuda_nullary()); - CALL_SUBTEST_1(test_cuda_elementwise_small()); - CALL_SUBTEST_1(test_cuda_elementwise()); - CALL_SUBTEST_1(test_cuda_props()); - CALL_SUBTEST_1(test_cuda_reduction()); - CALL_SUBTEST_2(test_cuda_contraction<ColMajor>()); - CALL_SUBTEST_2(test_cuda_contraction<RowMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_1d<ColMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_1d<RowMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_inner_dim_col_major_1d()); - CALL_SUBTEST_3(test_cuda_convolution_inner_dim_row_major_1d()); - CALL_SUBTEST_3(test_cuda_convolution_2d<ColMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_2d<RowMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_3d<ColMajor>()); - CALL_SUBTEST_3(test_cuda_convolution_3d<RowMajor>()); + CALL_SUBTEST_1(test_gpu_nullary()); + CALL_SUBTEST_1(test_gpu_elementwise_small()); + CALL_SUBTEST_1(test_gpu_elementwise()); + CALL_SUBTEST_1(test_gpu_props()); + CALL_SUBTEST_1(test_gpu_reduction()); + CALL_SUBTEST_2(test_gpu_contraction<ColMajor>()); + CALL_SUBTEST_2(test_gpu_contraction<RowMajor>()); + CALL_SUBTEST_3(test_gpu_convolution_1d<ColMajor>()); + CALL_SUBTEST_3(test_gpu_convolution_1d<RowMajor>()); + CALL_SUBTEST_3(test_gpu_convolution_inner_dim_col_major_1d()); + CALL_SUBTEST_3(test_gpu_convolution_inner_dim_row_major_1d()); + CALL_SUBTEST_3(test_gpu_convolution_2d<ColMajor>()); + CALL_SUBTEST_3(test_gpu_convolution_2d<RowMajor>()); +#if !defined(EIGEN_USE_HIP) +// disable these tests on HIP for now. +// they hang..need to investigate and fix + CALL_SUBTEST_3(test_gpu_convolution_3d<ColMajor>()); + CALL_SUBTEST_3(test_gpu_convolution_3d<RowMajor>()); +#endif #if __cplusplus > 199711L // std::erf, std::erfc, and so on where only added in c++11. We use them // as a golden reference to validate the results produced by Eigen. Therefore // we can only run these tests if we use a c++11 compiler. - CALL_SUBTEST_4(test_cuda_lgamma<float>(1.0f)); - CALL_SUBTEST_4(test_cuda_lgamma<float>(100.0f)); - CALL_SUBTEST_4(test_cuda_lgamma<float>(0.01f)); - CALL_SUBTEST_4(test_cuda_lgamma<float>(0.001f)); - - CALL_SUBTEST_4(test_cuda_lgamma<double>(1.0)); - CALL_SUBTEST_4(test_cuda_lgamma<double>(100.0)); - CALL_SUBTEST_4(test_cuda_lgamma<double>(0.01)); - CALL_SUBTEST_4(test_cuda_lgamma<double>(0.001)); - - CALL_SUBTEST_4(test_cuda_erf<float>(1.0f)); - CALL_SUBTEST_4(test_cuda_erf<float>(100.0f)); - CALL_SUBTEST_4(test_cuda_erf<float>(0.01f)); - CALL_SUBTEST_4(test_cuda_erf<float>(0.001f)); - - CALL_SUBTEST_4(test_cuda_erfc<float>(1.0f)); - // CALL_SUBTEST(test_cuda_erfc<float>(100.0f)); - CALL_SUBTEST_4(test_cuda_erfc<float>(5.0f)); // CUDA erfc lacks precision for large inputs - CALL_SUBTEST_4(test_cuda_erfc<float>(0.01f)); - CALL_SUBTEST_4(test_cuda_erfc<float>(0.001f)); - - CALL_SUBTEST_4(test_cuda_erf<double>(1.0)); - CALL_SUBTEST_4(test_cuda_erf<double>(100.0)); - CALL_SUBTEST_4(test_cuda_erf<double>(0.01)); - CALL_SUBTEST_4(test_cuda_erf<double>(0.001)); - - CALL_SUBTEST_4(test_cuda_erfc<double>(1.0)); - // CALL_SUBTEST(test_cuda_erfc<double>(100.0)); - CALL_SUBTEST_4(test_cuda_erfc<double>(5.0)); // CUDA erfc lacks precision for large inputs - CALL_SUBTEST_4(test_cuda_erfc<double>(0.01)); - CALL_SUBTEST_4(test_cuda_erfc<double>(0.001)); - - CALL_SUBTEST_5(test_cuda_digamma<float>()); - CALL_SUBTEST_5(test_cuda_digamma<double>()); + CALL_SUBTEST_4(test_gpu_lgamma<float>(1.0f)); + CALL_SUBTEST_4(test_gpu_lgamma<float>(100.0f)); + CALL_SUBTEST_4(test_gpu_lgamma<float>(0.01f)); + CALL_SUBTEST_4(test_gpu_lgamma<float>(0.001f)); + + CALL_SUBTEST_4(test_gpu_lgamma<double>(1.0)); + CALL_SUBTEST_4(test_gpu_lgamma<double>(100.0)); + CALL_SUBTEST_4(test_gpu_lgamma<double>(0.01)); + CALL_SUBTEST_4(test_gpu_lgamma<double>(0.001)); + + CALL_SUBTEST_4(test_gpu_erf<float>(1.0f)); + CALL_SUBTEST_4(test_gpu_erf<float>(100.0f)); + CALL_SUBTEST_4(test_gpu_erf<float>(0.01f)); + CALL_SUBTEST_4(test_gpu_erf<float>(0.001f)); + + CALL_SUBTEST_4(test_gpu_erfc<float>(1.0f)); + // CALL_SUBTEST(test_gpu_erfc<float>(100.0f)); + CALL_SUBTEST_4(test_gpu_erfc<float>(5.0f)); // GPU erfc lacks precision for large inputs + CALL_SUBTEST_4(test_gpu_erfc<float>(0.01f)); + CALL_SUBTEST_4(test_gpu_erfc<float>(0.001f)); + + CALL_SUBTEST_4(test_gpu_erf<double>(1.0)); + CALL_SUBTEST_4(test_gpu_erf<double>(100.0)); + CALL_SUBTEST_4(test_gpu_erf<double>(0.01)); + CALL_SUBTEST_4(test_gpu_erf<double>(0.001)); + + CALL_SUBTEST_4(test_gpu_erfc<double>(1.0)); + // CALL_SUBTEST(test_gpu_erfc<double>(100.0)); + CALL_SUBTEST_4(test_gpu_erfc<double>(5.0)); // GPU erfc lacks precision for large inputs + CALL_SUBTEST_4(test_gpu_erfc<double>(0.01)); + CALL_SUBTEST_4(test_gpu_erfc<double>(0.001)); + +#if !defined(EIGEN_USE_HIP) +// disable these tests on HIP for now. + CALL_SUBTEST_5(test_gpu_digamma<float>()); + CALL_SUBTEST_5(test_gpu_digamma<double>()); + + CALL_SUBTEST_5(test_gpu_polygamma<float>()); + CALL_SUBTEST_5(test_gpu_polygamma<double>()); + + CALL_SUBTEST_5(test_gpu_zeta<float>()); + CALL_SUBTEST_5(test_gpu_zeta<double>()); +#endif - CALL_SUBTEST_5(test_cuda_polygamma<float>()); - CALL_SUBTEST_5(test_cuda_polygamma<double>()); + CALL_SUBTEST_5(test_gpu_igamma<float>()); + CALL_SUBTEST_5(test_gpu_igammac<float>()); - CALL_SUBTEST_5(test_cuda_zeta<float>()); - CALL_SUBTEST_5(test_cuda_zeta<double>()); + CALL_SUBTEST_5(test_gpu_igamma<double>()); + CALL_SUBTEST_5(test_gpu_igammac<double>()); - CALL_SUBTEST_5(test_cuda_igamma<float>()); - CALL_SUBTEST_5(test_cuda_igammac<float>()); +#if !defined(EIGEN_USE_HIP) +// disable these tests on HIP for now. + CALL_SUBTEST_6(test_gpu_betainc<float>()); + CALL_SUBTEST_6(test_gpu_betainc<double>()); - CALL_SUBTEST_5(test_cuda_igamma<double>()); - CALL_SUBTEST_5(test_cuda_igammac<double>()); + CALL_SUBTEST_6(test_gpu_i0e<float>()); + CALL_SUBTEST_6(test_gpu_i0e<double>()); - CALL_SUBTEST_6(test_cuda_betainc<float>()); - CALL_SUBTEST_6(test_cuda_betainc<double>()); + CALL_SUBTEST_6(test_gpu_i1e<float>()); + CALL_SUBTEST_6(test_gpu_i1e<double>()); - CALL_SUBTEST_6(test_cuda_i0e<float>()); - CALL_SUBTEST_6(test_cuda_i0e<double>()); + CALL_SUBTEST_6(test_gpu_i1e<float>()); + CALL_SUBTEST_6(test_gpu_i1e<double>()); - CALL_SUBTEST_6(test_cuda_i1e<float>()); - CALL_SUBTEST_6(test_cuda_i1e<double>()); + CALL_SUBTEST_6(test_gpu_igamma_der_a<float>()); + CALL_SUBTEST_6(test_gpu_igamma_der_a<double>()); - CALL_SUBTEST_6(test_cuda_igamma_der_a<float>()); - CALL_SUBTEST_6(test_cuda_igamma_der_a<double>()); + CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<float>()); + CALL_SUBTEST_6(test_gpu_gamma_sample_der_alpha<double>()); +#endif - CALL_SUBTEST_6(test_cuda_gamma_sample_der_alpha<float>()); - CALL_SUBTEST_6(test_cuda_gamma_sample_der_alpha<double>()); #endif } diff --git a/unsupported/test/cxx11_tensor_of_float16_gpu.cu b/unsupported/test/cxx11_tensor_of_float16_gpu.cu index 7a751ff02..150fde8bf 100644 --- a/unsupported/test/cxx11_tensor_of_float16_gpu.cu +++ b/unsupported/test/cxx11_tensor_of_float16_gpu.cu @@ -9,7 +9,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_cuda +#define EIGEN_TEST_FUNC cxx11_tensor_of_float16_gpu #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU @@ -20,8 +20,8 @@ using Eigen::Tensor; template<typename> -void test_cuda_numext() { - Eigen::CudaStreamDevice stream; +void test_gpu_numext() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -57,11 +57,11 @@ void test_cuda_numext() { } -#ifdef EIGEN_HAS_CUDA_FP16 +#ifdef EIGEN_HAS_GPU_FP16 template<typename> -void test_cuda_conversion() { - Eigen::CudaStreamDevice stream; +void test_gpu_conversion() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -95,8 +95,8 @@ void test_cuda_conversion() { } template<typename> -void test_cuda_unary() { - Eigen::CudaStreamDevice stream; +void test_gpu_unary() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -132,8 +132,8 @@ void test_cuda_unary() { } template<typename> -void test_cuda_elementwise() { - Eigen::CudaStreamDevice stream; +void test_gpu_elementwise() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -174,8 +174,8 @@ void test_cuda_elementwise() { } template<typename> -void test_cuda_trancendental() { - Eigen::CudaStreamDevice stream; +void test_gpu_trancendental() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -268,8 +268,8 @@ void test_cuda_trancendental() { } template<typename> -void test_cuda_contractions() { - Eigen::CudaStreamDevice stream; +void test_gpu_contractions() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int rows = 23; int cols = 23; @@ -319,12 +319,12 @@ void test_cuda_contractions() { } template<typename> -void test_cuda_reductions(int size1, int size2, int redux) { +void test_gpu_reductions(int size1, int size2, int redux) { std::cout << "Reducing " << size1 << " by " << size2 << " tensor along dim " << redux << std::endl; - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = size1*size2; int result_size = (redux == 1 ? size1 : size2); @@ -368,20 +368,20 @@ void test_cuda_reductions(int size1, int size2, int redux) { } template<typename> -void test_cuda_reductions() { - test_cuda_reductions<void>(13, 13, 0); - test_cuda_reductions<void>(13, 13, 1); +void test_gpu_reductions() { + test_gpu_reductions<void>(13, 13, 0); + test_gpu_reductions<void>(13, 13, 1); - test_cuda_reductions<void>(35, 36, 0); - test_cuda_reductions<void>(35, 36, 1); + test_gpu_reductions<void>(35, 36, 0); + test_gpu_reductions<void>(35, 36, 1); - test_cuda_reductions<void>(36, 35, 0); - test_cuda_reductions<void>(36, 35, 1); + test_gpu_reductions<void>(36, 35, 0); + test_gpu_reductions<void>(36, 35, 1); } template<typename> -void test_cuda_full_reductions() { - Eigen::CudaStreamDevice stream; +void test_gpu_full_reductions() { + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int size = 13; int num_elem = size*size; @@ -429,9 +429,9 @@ void test_cuda_full_reductions() { } template<typename> -void test_cuda_forced_evals() { +void test_gpu_forced_evals() { - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); int num_elem = 101; @@ -479,20 +479,20 @@ void test_cuda_forced_evals() { #endif -void test_cxx11_tensor_of_float16_cuda() +void test_cxx11_tensor_of_float16_gpu() { - CALL_SUBTEST_1(test_cuda_numext<void>()); - -#ifdef EIGEN_HAS_CUDA_FP16 - CALL_SUBTEST_1(test_cuda_conversion<void>()); - CALL_SUBTEST_1(test_cuda_unary<void>()); - CALL_SUBTEST_1(test_cuda_elementwise<void>()); - CALL_SUBTEST_1(test_cuda_trancendental<void>()); - CALL_SUBTEST_2(test_cuda_contractions<void>()); - CALL_SUBTEST_3(test_cuda_reductions<void>()); - CALL_SUBTEST_4(test_cuda_full_reductions<void>()); - CALL_SUBTEST_5(test_cuda_forced_evals<void>()); + CALL_SUBTEST_1(test_gpu_numext<void>()); + +#ifdef EIGEN_HAS_GPU_FP16 + CALL_SUBTEST_1(test_gpu_conversion<void>()); + CALL_SUBTEST_1(test_gpu_unary<void>()); + CALL_SUBTEST_1(test_gpu_elementwise<void>()); + CALL_SUBTEST_1(test_gpu_trancendental<void>()); + CALL_SUBTEST_2(test_gpu_contractions<void>()); + CALL_SUBTEST_3(test_gpu_reductions<void>()); + CALL_SUBTEST_4(test_gpu_full_reductions<void>()); + CALL_SUBTEST_5(test_gpu_forced_evals<void>()); #else - std::cout << "Half floats are not supported by this version of cuda: skipping the test" << std::endl; + std::cout << "Half floats are not supported by this version of gpu: skipping the test" << std::endl; #endif } diff --git a/unsupported/test/cxx11_tensor_random_gpu.cu b/unsupported/test/cxx11_tensor_random_gpu.cu index 389c0a8c2..da5977f09 100644 --- a/unsupported/test/cxx11_tensor_random_gpu.cu +++ b/unsupported/test/cxx11_tensor_random_gpu.cu @@ -9,15 +9,16 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_random_cuda +#define EIGEN_TEST_FUNC cxx11_tensor_random_gpu #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU #include "main.h" #include <Eigen/CXX11/Tensor> +#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> -void test_cuda_random_uniform() +void test_gpu_random_uniform() { Tensor<float, 2> out(72,97); out.setZero(); @@ -25,24 +26,24 @@ void test_cuda_random_uniform() std::size_t out_bytes = out.size() * sizeof(float); float* d_out; - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); gpu_out.device(gpu_device) = gpu_out.random(); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); // For now we just check this code doesn't crash. // TODO: come up with a valid test of randomness } -void test_cuda_random_normal() +void test_gpu_random_normal() { Tensor<float, 2> out(72,97); out.setZero(); @@ -50,9 +51,9 @@ void test_cuda_random_normal() std::size_t out_bytes = out.size() * sizeof(float); float* d_out; - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 2> > gpu_out(d_out, 72,97); @@ -60,8 +61,8 @@ void test_cuda_random_normal() Eigen::internal::NormalRandomGenerator<float> gen(true); gpu_out.device(gpu_device) = gpu_out.random(gen); - assert(cudaMemcpyAsync(out.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(out.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); } static void test_complex() @@ -77,9 +78,9 @@ static void test_complex() } -void test_cxx11_tensor_random_cuda() +void test_cxx11_tensor_random_gpu() { - CALL_SUBTEST(test_cuda_random_uniform()); - CALL_SUBTEST(test_cuda_random_normal()); + CALL_SUBTEST(test_gpu_random_uniform()); + CALL_SUBTEST(test_gpu_random_normal()); CALL_SUBTEST(test_complex()); } diff --git a/unsupported/test/cxx11_tensor_reduction_gpu.cu b/unsupported/test/cxx11_tensor_reduction_gpu.cu index ec0669704..a36759303 100644 --- a/unsupported/test/cxx11_tensor_reduction_gpu.cu +++ b/unsupported/test/cxx11_tensor_reduction_gpu.cu @@ -9,7 +9,7 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_reduction_cuda +#define EIGEN_TEST_FUNC cxx11_tensor_reduction_gpu #define EIGEN_USE_GPU #include "main.h" @@ -19,7 +19,7 @@ template<typename Type, int DataLayout> static void test_full_reductions() { - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); const int num_rows = internal::random<int>(1024, 5*1024); @@ -67,7 +67,7 @@ static void test_first_dim_reductions() { Tensor<Type, 2, DataLayout> redux = in.sum(red_axis); // Create device - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice dev(&stream); // Create data(T) @@ -107,7 +107,7 @@ static void test_last_dim_reductions() { Tensor<Type, 2, DataLayout> redux = in.sum(red_axis); // Create device - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice dev(&stream); // Create data @@ -134,7 +134,7 @@ static void test_last_dim_reductions() { } -void test_cxx11_tensor_reduction_cuda() { +void test_cxx11_tensor_reduction_gpu() { CALL_SUBTEST_1((test_full_reductions<float, ColMajor>())); CALL_SUBTEST_1((test_full_reductions<double, ColMajor>())); CALL_SUBTEST_2((test_full_reductions<float, RowMajor>())); diff --git a/unsupported/test/cxx11_tensor_scan_gpu.cu b/unsupported/test/cxx11_tensor_scan_gpu.cu index 1d4edef11..51cd3a3cf 100644 --- a/unsupported/test/cxx11_tensor_scan_gpu.cu +++ b/unsupported/test/cxx11_tensor_scan_gpu.cu @@ -9,19 +9,20 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_scan_cuda +#define EIGEN_TEST_FUNC cxx11_tensor_scan_gpu #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +#include <Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> using Eigen::Tensor; typedef Tensor<float, 1>::DimensionPair DimPair; template<int DataLayout> -void test_cuda_cumsum(int m_size, int k_size, int n_size) +void test_gpu_cumsum(int m_size, int k_size, int n_size) { std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; Tensor<float, 3, DataLayout> t_input(m_size, k_size, n_size); @@ -36,12 +37,12 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size) float* d_t_input; float* d_t_result; - cudaMalloc((void**)(&d_t_input), t_input_bytes); - cudaMalloc((void**)(&d_t_result), t_result_bytes); + gpuMalloc((void**)(&d_t_input), t_input_bytes); + gpuMalloc((void**)(&d_t_result), t_result_bytes); - cudaMemcpy(d_t_input, t_input.data(), t_input_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_t_input, t_input.data(), t_input_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 3, DataLayout> > @@ -52,7 +53,7 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size) gpu_t_result.device(gpu_device) = gpu_t_input.cumsum(1); t_result = t_input.cumsum(1); - cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); for (DenseIndex i = 0; i < t_result.size(); i++) { if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { continue; @@ -65,13 +66,13 @@ void test_cuda_cumsum(int m_size, int k_size, int n_size) assert(false); } - cudaFree((void*)d_t_input); - cudaFree((void*)d_t_result); + gpuFree((void*)d_t_input); + gpuFree((void*)d_t_result); } -void test_cxx11_tensor_scan_cuda() +void test_cxx11_tensor_scan_gpu() { - CALL_SUBTEST_1(test_cuda_cumsum<ColMajor>(128, 128, 128)); - CALL_SUBTEST_2(test_cuda_cumsum<RowMajor>(128, 128, 128)); + CALL_SUBTEST_1(test_gpu_cumsum<ColMajor>(128, 128, 128)); + CALL_SUBTEST_2(test_gpu_cumsum<RowMajor>(128, 128, 128)); } |