From 1bb6fa99a31d2dcf5431087d3f238e2dcca03084 Mon Sep 17 00:00:00 2001
From: Deven Desai
Date: Wed, 20 Jun 2018 16:44:58 -0400
Subject: merging the CUDA and HIP implementation for the Tensor directory and
 the unit tests

---
 unsupported/test/cxx11_tensor_argmax_gpu.cu | 90 +++++++++++++++--------------
 1 file changed, 46 insertions(+), 44 deletions(-)

diff --git a/unsupported/test/cxx11_tensor_argmax_gpu.cu b/unsupported/test/cxx11_tensor_argmax_gpu.cu
index 3d73d491a..541a27865 100644
--- a/unsupported/test/cxx11_tensor_argmax_gpu.cu
+++ b/unsupported/test/cxx11_tensor_argmax_gpu.cu
@@ -9,16 +9,18 @@
 
 #define EIGEN_TEST_NO_LONGDOUBLE
-#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+#define EIGEN_TEST_FUNC cxx11_tensor_gpu
 #define EIGEN_USE_GPU
 
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
+
 using Eigen::Tensor;
 
 template <int Layout>
-void test_cuda_simple_argmax()
+void test_gpu_simple_argmax()
 {
   Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97));
   Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1));
   Tensor<DenseIndex, 1, Layout> out_min(Eigen::array<DenseIndex, 1>(1));
@@ -34,13 +36,13 @@ void test_cuda_simple_argmax()
   double* d_in;
   DenseIndex* d_out_max;
   DenseIndex* d_out_min;
-  cudaMalloc((void**)(&d_in), in_bytes);
-  cudaMalloc((void**)(&d_out_max), out_bytes);
-  cudaMalloc((void**)(&d_out_min), out_bytes);
+  gpuMalloc((void**)(&d_in), in_bytes);
+  gpuMalloc((void**)(&d_out_max), out_bytes);
+  gpuMalloc((void**)(&d_out_min), out_bytes);
 
-  cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice);
+  gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice);
 
-  Eigen::CudaStreamDevice stream;
+  Eigen::GpuStreamDevice stream;
   Eigen::GpuDevice gpu_device(&stream);
 
   Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97));
@@ -50,20 +52,20 @@ void test_cuda_simple_argmax()
   gpu_out_max.device(gpu_device) = gpu_in.argmax();
   gpu_out_min.device(gpu_device) = gpu_in.argmin();
 
-  assert(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-  assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+  assert(gpuMemcpyAsync(out_max.data(), d_out_max, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+  assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
   VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1);
   VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0);
 
-  cudaFree(d_in);
-  cudaFree(d_out_max);
-  cudaFree(d_out_min);
+  gpuFree(d_in);
+  gpuFree(d_out_max);
+  gpuFree(d_out_min);
 }
 
 template <int DataLayout>
-void test_cuda_argmax_dim()
+void test_gpu_argmax_dim()
 {
   Tensor<float, 4, DataLayout> tensor(2,3,5,7);
   std::vector<int> dims;
@@ -97,12 +99,12 @@ void test_cuda_argmax_dim()
 
     float* d_in;
     DenseIndex* d_out;
-    cudaMalloc((void**)(&d_in), in_bytes);
-    cudaMalloc((void**)(&d_out), out_bytes);
+    gpuMalloc((void**)(&d_in), in_bytes);
+    gpuMalloc((void**)(&d_out), out_bytes);
 
-    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+    gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
 
-    Eigen::CudaStreamDevice stream;
+    Eigen::GpuStreamDevice stream;
     Eigen::GpuDevice gpu_device(&stream);
 
     Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
@@ -110,8 +112,8 @@
 
     gpu_out.device(gpu_device) = gpu_in.argmax(dim);
 
-    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+    assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+    assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
     VERIFY_IS_EQUAL(tensor_arg.size(), size_t(2*3*5*7 / tensor.dimension(dim)));
 
@@ -134,25 +136,25 @@
       }
     }
 
-    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+    gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
 
     gpu_out.device(gpu_device) = gpu_in.argmax(dim);
 
-    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+    assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+    assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
     for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
      // Expect max to be in the last index of the reduced dimension
       VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
     }
 
-    cudaFree(d_in);
-    cudaFree(d_out);
+    gpuFree(d_in);
+    gpuFree(d_out);
   }
 }
 
 template <int DataLayout>
-void test_cuda_argmin_dim()
+void test_gpu_argmin_dim()
 {
   Tensor<float, 4, DataLayout> tensor(2,3,5,7);
   std::vector<int> dims;
@@ -186,12 +188,12 @@ void test_cuda_argmin_dim()
 
     float* d_in;
     DenseIndex* d_out;
-    cudaMalloc((void**)(&d_in), in_bytes);
-    cudaMalloc((void**)(&d_out), out_bytes);
+    gpuMalloc((void**)(&d_in), in_bytes);
+    gpuMalloc((void**)(&d_out), out_bytes);
 
-    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+    gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
 
-    Eigen::CudaStreamDevice stream;
+    Eigen::GpuStreamDevice stream;
     Eigen::GpuDevice gpu_device(&stream);
 
     Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7));
@@ -199,8 +201,8 @@
 
     gpu_out.device(gpu_device) = gpu_in.argmin(dim);
 
-    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+    assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+    assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
     VERIFY_IS_EQUAL(tensor_arg.size(), 2*3*5*7 / tensor.dimension(dim));
 
@@ -223,29 +225,29 @@
      }
    }
 
-    cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice);
+    gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice);
 
     gpu_out.device(gpu_device) = gpu_in.argmin(dim);
 
-    assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess);
-    assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess);
+    assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess);
+    assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess);
 
     for (DenseIndex n = 0; n < tensor_arg.size(); ++n) {
      // Expect max to be in the last index of the reduced dimension
      VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1);
     }
 
-    cudaFree(d_in);
-    cudaFree(d_out);
+    gpuFree(d_in);
+    gpuFree(d_out);
   }
 }
 
-void test_cxx11_tensor_cuda()
+void test_cxx11_tensor_gpu()
 {
-  CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>());
-  CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>());
-  CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>());
-  CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>());
-  CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>());
-  CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>());
+  CALL_SUBTEST_1(test_gpu_simple_argmax<RowMajor>());
+  CALL_SUBTEST_1(test_gpu_simple_argmax<ColMajor>());
+  CALL_SUBTEST_2(test_gpu_argmax_dim<RowMajor>());
+  CALL_SUBTEST_2(test_gpu_argmax_dim<ColMajor>());
+  CALL_SUBTEST_3(test_gpu_argmin_dim<RowMajor>());
+  CALL_SUBTEST_3(test_gpu_argmin_dim<ColMajor>());
 }
-- 
cgit v1.2.3
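
The gpu*-prefixed names this patch switches to (gpuMalloc, gpuMemcpy, gpuMemcpyAsync, gpuStreamSynchronize, gpuFree, gpuSuccess, and the gpuMemcpy* direction constants) are not a third runtime API: they are compile-time aliases, pulled in through the newly added TensorGpuHipCudaDefines.h include, that resolve to the CUDA runtime on NVIDIA builds and to the HIP runtime on AMD builds. The sketch below illustrates the mapping technique with a hypothetical shim; the macro spellings mirror the calls used in this test, but the EIGEN_USE_HIP switch and the exact contents of Eigen's real header are assumptions for illustration, not a copy of it.

// Hypothetical portability shim, sketched from the gpu* names used above.
// Eigen's actual mapping lives in TensorGpuHipCudaDefines.h and may differ.
#if defined(EIGEN_USE_HIP)   // assumed switch: HIP (AMD) build
#include <hip/hip_runtime.h>
#define gpuMalloc             hipMalloc
#define gpuMemcpy             hipMemcpy
#define gpuMemcpyAsync        hipMemcpyAsync
#define gpuMemcpyHostToDevice hipMemcpyHostToDevice
#define gpuMemcpyDeviceToHost hipMemcpyDeviceToHost
#define gpuStreamSynchronize  hipStreamSynchronize
#define gpuFree               hipFree
#define gpuSuccess            hipSuccess
#else                        // CUDA (NVIDIA) build
#include <cuda_runtime.h>
#define gpuMalloc             cudaMalloc
#define gpuMemcpy             cudaMemcpy
#define gpuMemcpyAsync        cudaMemcpyAsync
#define gpuMemcpyHostToDevice cudaMemcpyHostToDevice
#define gpuMemcpyDeviceToHost cudaMemcpyDeviceToHost
#define gpuStreamSynchronize  cudaStreamSynchronize
#define gpuFree               cudaFree
#define gpuSuccess            cudaSuccess
#endif

With a shim like this, a single test source compiles unchanged for both vendors, which is why the diff can rename every cuda* call one-for-one to its gpu* counterpart (and Eigen::CudaStreamDevice to Eigen::GpuStreamDevice) instead of maintaining a separate HIP copy of the file.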