diff options
author | Deven Desai <deven.desai.amd@gmail.com> | 2018-06-20 16:44:58 -0400 |
---|---|---|
committer | Deven Desai <deven.desai.amd@gmail.com> | 2018-06-20 16:44:58 -0400 |
commit | 1bb6fa99a31d2dcf5431087d3f238e2dcca03084 (patch) | |
tree | e62d41b8d6430849aea4bf97785a54488bf542d4 /unsupported/test/cxx11_tensor_argmax_gpu.cu | |
parent | cfdabbcc8f708c06da2bfa4e924edc25619f013a (diff) |
merging the CUDA and HIP implementation for the Tensor directory and the unit tests
Diffstat (limited to 'unsupported/test/cxx11_tensor_argmax_gpu.cu')
-rw-r--r-- | unsupported/test/cxx11_tensor_argmax_gpu.cu | 90 |
1 file changed, 46 insertions, 44 deletions
diff --git a/unsupported/test/cxx11_tensor_argmax_gpu.cu b/unsupported/test/cxx11_tensor_argmax_gpu.cu index 3d73d491a..541a27865 100644 --- a/unsupported/test/cxx11_tensor_argmax_gpu.cu +++ b/unsupported/test/cxx11_tensor_argmax_gpu.cu @@ -9,16 +9,18 @@ #define EIGEN_TEST_NO_LONGDOUBLE -#define EIGEN_TEST_FUNC cxx11_tensor_cuda +#define EIGEN_TEST_FUNC cxx11_tensor_gpu #define EIGEN_USE_GPU #include "main.h" #include <unsupported/Eigen/CXX11/Tensor> +#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h> + using Eigen::Tensor; template <int Layout> -void test_cuda_simple_argmax() +void test_gpu_simple_argmax() { Tensor<double, 3, Layout> in(Eigen::array<DenseIndex, 3>(72,53,97)); Tensor<DenseIndex, 1, Layout> out_max(Eigen::array<DenseIndex, 1>(1)); @@ -34,13 +36,13 @@ void test_cuda_simple_argmax() double* d_in; DenseIndex* d_out_max; DenseIndex* d_out_min; - cudaMalloc((void**)(&d_in), in_bytes); - cudaMalloc((void**)(&d_out_max), out_bytes); - cudaMalloc((void**)(&d_out_min), out_bytes); + gpuMalloc((void**)(&d_in), in_bytes); + gpuMalloc((void**)(&d_out_max), out_bytes); + gpuMalloc((void**)(&d_out_min), out_bytes); - cudaMemcpy(d_in, in.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, in.data(), in_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<double, 3, Layout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 3>(72,53,97)); @@ -50,20 +52,20 @@ void test_cuda_simple_argmax() gpu_out_max.device(gpu_device) = gpu_in.argmax(); gpu_out_min.device(gpu_device) = gpu_in.argmin(); - assert(cudaMemcpyAsync(out_max.data(), d_out_max, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaMemcpyAsync(out_min.data(), d_out_min, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + 
assert(gpuMemcpyAsync(out_max.data(), d_out_max, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuMemcpyAsync(out_min.data(), d_out_min, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); VERIFY_IS_EQUAL(out_max(Eigen::array<DenseIndex, 1>(0)), 72*53*97 - 1); VERIFY_IS_EQUAL(out_min(Eigen::array<DenseIndex, 1>(0)), 0); - cudaFree(d_in); - cudaFree(d_out_max); - cudaFree(d_out_min); + gpuFree(d_in); + gpuFree(d_out_max); + gpuFree(d_out_min); } template <int DataLayout> -void test_cuda_argmax_dim() +void test_gpu_argmax_dim() { Tensor<float, 4, DataLayout> tensor(2,3,5,7); std::vector<int> dims; @@ -97,12 +99,12 @@ void test_cuda_argmax_dim() float* d_in; DenseIndex* d_out; - cudaMalloc((void**)(&d_in), in_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in), in_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7)); @@ -110,8 +112,8 @@ void test_cuda_argmax_dim() gpu_out.device(gpu_device) = gpu_in.argmax(dim); - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); VERIFY_IS_EQUAL(tensor_arg.size(), size_t(2*3*5*7 / tensor.dimension(dim))); @@ -134,25 +136,25 @@ void test_cuda_argmax_dim() } } - cudaMemcpy(d_in, tensor.data(), in_bytes, 
cudaMemcpyHostToDevice); + gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice); gpu_out.device(gpu_device) = gpu_in.argmax(dim); - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } } template <int DataLayout> -void test_cuda_argmin_dim() +void test_gpu_argmin_dim() { Tensor<float, 4, DataLayout> tensor(2,3,5,7); std::vector<int> dims; @@ -186,12 +188,12 @@ void test_cuda_argmin_dim() float* d_in; DenseIndex* d_out; - cudaMalloc((void**)(&d_in), in_bytes); - cudaMalloc((void**)(&d_out), out_bytes); + gpuMalloc((void**)(&d_in), in_bytes); + gpuMalloc((void**)(&d_out), out_bytes); - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice); - Eigen::CudaStreamDevice stream; + Eigen::GpuStreamDevice stream; Eigen::GpuDevice gpu_device(&stream); Eigen::TensorMap<Eigen::Tensor<float, 4, DataLayout>, Aligned > gpu_in(d_in, Eigen::array<DenseIndex, 4>(2, 3, 5, 7)); @@ -199,8 +201,8 @@ void test_cuda_argmin_dim() gpu_out.device(gpu_device) = gpu_in.argmin(dim); - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + 
assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); VERIFY_IS_EQUAL(tensor_arg.size(), 2*3*5*7 / tensor.dimension(dim)); @@ -223,29 +225,29 @@ void test_cuda_argmin_dim() } } - cudaMemcpy(d_in, tensor.data(), in_bytes, cudaMemcpyHostToDevice); + gpuMemcpy(d_in, tensor.data(), in_bytes, gpuMemcpyHostToDevice); gpu_out.device(gpu_device) = gpu_in.argmin(dim); - assert(cudaMemcpyAsync(tensor_arg.data(), d_out, out_bytes, cudaMemcpyDeviceToHost, gpu_device.stream()) == cudaSuccess); - assert(cudaStreamSynchronize(gpu_device.stream()) == cudaSuccess); + assert(gpuMemcpyAsync(tensor_arg.data(), d_out, out_bytes, gpuMemcpyDeviceToHost, gpu_device.stream()) == gpuSuccess); + assert(gpuStreamSynchronize(gpu_device.stream()) == gpuSuccess); for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { // Expect max to be in the last index of the reduced dimension VERIFY_IS_EQUAL(tensor_arg.data()[n], tensor.dimension(dim) - 1); } - cudaFree(d_in); - cudaFree(d_out); + gpuFree(d_in); + gpuFree(d_out); } } -void test_cxx11_tensor_cuda() +void test_cxx11_tensor_gpu() { - CALL_SUBTEST_1(test_cuda_simple_argmax<RowMajor>()); - CALL_SUBTEST_1(test_cuda_simple_argmax<ColMajor>()); - CALL_SUBTEST_2(test_cuda_argmax_dim<RowMajor>()); - CALL_SUBTEST_2(test_cuda_argmax_dim<ColMajor>()); - CALL_SUBTEST_3(test_cuda_argmin_dim<RowMajor>()); - CALL_SUBTEST_3(test_cuda_argmin_dim<ColMajor>()); + CALL_SUBTEST_1(test_gpu_simple_argmax<RowMajor>()); + CALL_SUBTEST_1(test_gpu_simple_argmax<ColMajor>()); + CALL_SUBTEST_2(test_gpu_argmax_dim<RowMajor>()); + CALL_SUBTEST_2(test_gpu_argmax_dim<ColMajor>()); + CALL_SUBTEST_3(test_gpu_argmin_dim<RowMajor>()); + CALL_SUBTEST_3(test_gpu_argmin_dim<ColMajor>()); } |