From 1bb6fa99a31d2dcf5431087d3f238e2dcca03084 Mon Sep 17 00:00:00 2001 From: Deven Desai Date: Wed, 20 Jun 2018 16:44:58 -0400 Subject: merging the CUDA and HIP implementation for the Tensor directory and the unit tests --- unsupported/test/cxx11_tensor_contract_gpu.cu | 92 ++++++++++++++------------- 1 file changed, 48 insertions(+), 44 deletions(-) (limited to 'unsupported/test/cxx11_tensor_contract_gpu.cu') diff --git a/unsupported/test/cxx11_tensor_contract_gpu.cu b/unsupported/test/cxx11_tensor_contract_gpu.cu index 3621e2aa6..061d0464e 100644 --- a/unsupported/test/cxx11_tensor_contract_gpu.cu +++ b/unsupported/test/cxx11_tensor_contract_gpu.cu @@ -10,19 +10,20 @@ #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX -#define EIGEN_TEST_FUNC cxx11_tensor_cuda +#define EIGEN_TEST_FUNC cxx11_tensor_gpu #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int #define EIGEN_USE_GPU #include "main.h" #include +#include using Eigen::Tensor; typedef Tensor::DimensionPair DimPair; template -void test_cuda_contraction(int m_size, int k_size, int n_size) +void test_gpu_contraction(int m_size, int k_size, int n_size) { std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is @@ -45,14 +46,14 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) float* d_t_right; float* d_t_result; - cudaMalloc((void**)(&d_t_left), t_left_bytes); - cudaMalloc((void**)(&d_t_right), t_right_bytes); - cudaMalloc((void**)(&d_t_result), t_result_bytes); + gpuMalloc((void**)(&d_t_left), t_left_bytes); + gpuMalloc((void**)(&d_t_right), t_right_bytes); + gpuMalloc((void**)(&d_t_result), t_result_bytes); - cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap > @@ -66,7 +67,7 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); t_result = t_left.contract(t_right, dims); - cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); for (DenseIndex i = 0; i < t_result.size(); i++) { if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) { continue; @@ -79,9 +80,9 @@ void test_cuda_contraction(int m_size, int k_size, int n_size) assert(false); } - cudaFree((void*)d_t_left); - cudaFree((void*)d_t_right); - cudaFree((void*)d_t_result); + gpuFree((void*)d_t_left); + gpuFree((void*)d_t_right); + gpuFree((void*)d_t_result); } @@ -109,14 +110,14 @@ void test_scalar(int m_size, int k_size, int n_size) float* d_t_right; float* d_t_result; - cudaMalloc((void**)(&d_t_left), t_left_bytes); - cudaMalloc((void**)(&d_t_right), t_right_bytes); - cudaMalloc((void**)(&d_t_result), t_result_bytes); + gpuMalloc((void**)(&d_t_left), t_left_bytes); + gpuMalloc((void**)(&d_t_right), t_right_bytes); + gpuMalloc((void**)(&d_t_result), t_result_bytes); - cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice); - cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice); + gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap > @@ -129,7 +130,7 @@ void test_scalar(int m_size, int k_size, int n_size) gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims); t_result = t_left.contract(t_right, dims); - cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost); + gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost); if (fabs(t_result() - t_result_gpu()) > 1e-4f && !Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) { std::cout << "mismatch detected: " << t_result() @@ -137,39 +138,39 @@ void test_scalar(int m_size, int k_size, int n_size) assert(false); } - cudaFree((void*)d_t_left); - cudaFree((void*)d_t_right); - cudaFree((void*)d_t_result); + gpuFree((void*)d_t_left); + gpuFree((void*)d_t_right); + gpuFree((void*)d_t_result); } template -void test_cuda_contraction_m() { +void test_gpu_contraction_m() { for (int k = 32; k < 256; k++) { - test_cuda_contraction(k, 128, 128); - test_cuda_contraction(k, 128, 128); + test_gpu_contraction(k, 128, 128); + test_gpu_contraction(k, 128, 128); } } template -void test_cuda_contraction_k() { +void test_gpu_contraction_k() { for (int k = 32; k < 256; k++) { - test_cuda_contraction(128, k, 128); - test_cuda_contraction(128, k, 128); + test_gpu_contraction(128, k, 128); + test_gpu_contraction(128, k, 128); } } template -void test_cuda_contraction_n() { +void test_gpu_contraction_n() { for (int k = 32; k < 256; k++) { - test_cuda_contraction(128, 128, k); - test_cuda_contraction(128, 128, k); + test_gpu_contraction(128, 128, k); + test_gpu_contraction(128, 128, k); } } template -void test_cuda_contraction_sizes() { +void test_gpu_contraction_sizes() { int m_sizes[] = { 31, 39, 63, 64, 65, 127, 129, 255, 257 , 511, 512, 513, 1023, 1024, 1025}; @@ -186,29 +187,32 @@ void test_cuda_contraction_sizes() { for (int i = 0; i < 15; i++) { for (int j = 0; j < 15; j++) { for (int k = 0; k < 17; k++) { - test_cuda_contraction(m_sizes[i], n_sizes[j], k_sizes[k]); + test_gpu_contraction(m_sizes[i], n_sizes[j], k_sizes[k]); } } } } -void test_cxx11_tensor_cuda() +void test_cxx11_tensor_gpu() { - CALL_SUBTEST_1(test_cuda_contraction(128, 128, 128)); - CALL_SUBTEST_1(test_cuda_contraction(128, 128, 128)); + CALL_SUBTEST_1(test_gpu_contraction(128, 128, 128)); + CALL_SUBTEST_1(test_gpu_contraction(128, 128, 128)); CALL_SUBTEST_1(test_scalar(128, 128, 128)); CALL_SUBTEST_1(test_scalar(128, 128, 128)); - CALL_SUBTEST_2(test_cuda_contraction_m()); - CALL_SUBTEST_3(test_cuda_contraction_m()); + CALL_SUBTEST_2(test_gpu_contraction_m()); + CALL_SUBTEST_3(test_gpu_contraction_m()); - CALL_SUBTEST_4(test_cuda_contraction_k()); - CALL_SUBTEST_5(test_cuda_contraction_k()); + CALL_SUBTEST_4(test_gpu_contraction_k()); + CALL_SUBTEST_5(test_gpu_contraction_k()); - CALL_SUBTEST_6(test_cuda_contraction_n()); - CALL_SUBTEST_7(test_cuda_contraction_n()); + CALL_SUBTEST_6(test_gpu_contraction_n()); + CALL_SUBTEST_7(test_gpu_contraction_n()); - CALL_SUBTEST_8(test_cuda_contraction_sizes()); - CALL_SUBTEST_9(test_cuda_contraction_sizes()); +#if !defined(EIGEN_USE_HIP) +// disable these subtests for HIP + CALL_SUBTEST_8(test_gpu_contraction_sizes()); + CALL_SUBTEST_9(test_gpu_contraction_sizes()); +#endif } -- cgit v1.2.3