aboutsummaryrefslogtreecommitdiffhomepage
path: root/unsupported/test/cxx11_tensor_contract_gpu.cu
diff options
context:
space:
mode:
authorGravatar Deven Desai <deven.desai.amd@gmail.com>2018-06-20 16:44:58 -0400
committerGravatar Deven Desai <deven.desai.amd@gmail.com>2018-06-20 16:44:58 -0400
commit1bb6fa99a31d2dcf5431087d3f238e2dcca03084 (patch)
treee62d41b8d6430849aea4bf97785a54488bf542d4 /unsupported/test/cxx11_tensor_contract_gpu.cu
parentcfdabbcc8f708c06da2bfa4e924edc25619f013a (diff)
merging the CUDA and HIP implementation for the Tensor directory and the unit tests
Diffstat (limited to 'unsupported/test/cxx11_tensor_contract_gpu.cu')
-rw-r--r--unsupported/test/cxx11_tensor_contract_gpu.cu92
1 files changed, 48 insertions, 44 deletions
diff --git a/unsupported/test/cxx11_tensor_contract_gpu.cu b/unsupported/test/cxx11_tensor_contract_gpu.cu
index 3621e2aa6..061d0464e 100644
--- a/unsupported/test/cxx11_tensor_contract_gpu.cu
+++ b/unsupported/test/cxx11_tensor_contract_gpu.cu
@@ -10,19 +10,20 @@
#define EIGEN_TEST_NO_LONGDOUBLE
#define EIGEN_TEST_NO_COMPLEX
-#define EIGEN_TEST_FUNC cxx11_tensor_cuda
+#define EIGEN_TEST_FUNC cxx11_tensor_gpu
#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
#define EIGEN_USE_GPU
#include "main.h"
#include <unsupported/Eigen/CXX11/Tensor>
+#include <unsupported/Eigen/CXX11/src/Tensor/TensorGpuHipCudaDefines.h>
using Eigen::Tensor;
typedef Tensor<float, 1>::DimensionPair DimPair;
template<int DataLayout>
-void test_cuda_contraction(int m_size, int k_size, int n_size)
+void test_gpu_contraction(int m_size, int k_size, int n_size)
{
std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl;
// with these dimensions, the output has 300 * 140 elements, which is
@@ -45,14 +46,14 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
float* d_t_right;
float* d_t_result;
- cudaMalloc((void**)(&d_t_left), t_left_bytes);
- cudaMalloc((void**)(&d_t_right), t_right_bytes);
- cudaMalloc((void**)(&d_t_result), t_result_bytes);
+ gpuMalloc((void**)(&d_t_left), t_left_bytes);
+ gpuMalloc((void**)(&d_t_right), t_right_bytes);
+ gpuMalloc((void**)(&d_t_result), t_result_bytes);
- cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
@@ -66,7 +67,7 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
t_result = t_left.contract(t_right, dims);
- cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+ gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
for (DenseIndex i = 0; i < t_result.size(); i++) {
if (fabs(t_result(i) - t_result_gpu(i)) < 1e-4f) {
continue;
@@ -79,9 +80,9 @@ void test_cuda_contraction(int m_size, int k_size, int n_size)
assert(false);
}
- cudaFree((void*)d_t_left);
- cudaFree((void*)d_t_right);
- cudaFree((void*)d_t_result);
+ gpuFree((void*)d_t_left);
+ gpuFree((void*)d_t_right);
+ gpuFree((void*)d_t_result);
}
@@ -109,14 +110,14 @@ void test_scalar(int m_size, int k_size, int n_size)
float* d_t_right;
float* d_t_result;
- cudaMalloc((void**)(&d_t_left), t_left_bytes);
- cudaMalloc((void**)(&d_t_right), t_right_bytes);
- cudaMalloc((void**)(&d_t_result), t_result_bytes);
+ gpuMalloc((void**)(&d_t_left), t_left_bytes);
+ gpuMalloc((void**)(&d_t_right), t_right_bytes);
+ gpuMalloc((void**)(&d_t_result), t_result_bytes);
- cudaMemcpy(d_t_left, t_left.data(), t_left_bytes, cudaMemcpyHostToDevice);
- cudaMemcpy(d_t_right, t_right.data(), t_right_bytes, cudaMemcpyHostToDevice);
+ gpuMemcpy(d_t_left, t_left.data(), t_left_bytes, gpuMemcpyHostToDevice);
+ gpuMemcpy(d_t_right, t_right.data(), t_right_bytes, gpuMemcpyHostToDevice);
- Eigen::CudaStreamDevice stream;
+ Eigen::GpuStreamDevice stream;
Eigen::GpuDevice gpu_device(&stream);
Eigen::TensorMap<Eigen::Tensor<float, 2, DataLayout> >
@@ -129,7 +130,7 @@ void test_scalar(int m_size, int k_size, int n_size)
gpu_t_result.device(gpu_device) = gpu_t_left.contract(gpu_t_right, dims);
t_result = t_left.contract(t_right, dims);
- cudaMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, cudaMemcpyDeviceToHost);
+ gpuMemcpy(t_result_gpu.data(), d_t_result, t_result_bytes, gpuMemcpyDeviceToHost);
if (fabs(t_result() - t_result_gpu()) > 1e-4f &&
!Eigen::internal::isApprox(t_result(), t_result_gpu(), 1e-4f)) {
std::cout << "mismatch detected: " << t_result()
@@ -137,39 +138,39 @@ void test_scalar(int m_size, int k_size, int n_size)
assert(false);
}
- cudaFree((void*)d_t_left);
- cudaFree((void*)d_t_right);
- cudaFree((void*)d_t_result);
+ gpuFree((void*)d_t_left);
+ gpuFree((void*)d_t_right);
+ gpuFree((void*)d_t_result);
}
template<int DataLayout>
-void test_cuda_contraction_m() {
+void test_gpu_contraction_m() {
for (int k = 32; k < 256; k++) {
- test_cuda_contraction<ColMajor>(k, 128, 128);
- test_cuda_contraction<RowMajor>(k, 128, 128);
+ test_gpu_contraction<ColMajor>(k, 128, 128);
+ test_gpu_contraction<RowMajor>(k, 128, 128);
}
}
template<int DataLayout>
-void test_cuda_contraction_k() {
+void test_gpu_contraction_k() {
for (int k = 32; k < 256; k++) {
- test_cuda_contraction<ColMajor>(128, k, 128);
- test_cuda_contraction<RowMajor>(128, k, 128);
+ test_gpu_contraction<ColMajor>(128, k, 128);
+ test_gpu_contraction<RowMajor>(128, k, 128);
}
}
template<int DataLayout>
-void test_cuda_contraction_n() {
+void test_gpu_contraction_n() {
for (int k = 32; k < 256; k++) {
- test_cuda_contraction<ColMajor>(128, 128, k);
- test_cuda_contraction<RowMajor>(128, 128, k);
+ test_gpu_contraction<ColMajor>(128, 128, k);
+ test_gpu_contraction<RowMajor>(128, 128, k);
}
}
template<int DataLayout>
-void test_cuda_contraction_sizes() {
+void test_gpu_contraction_sizes() {
int m_sizes[] = { 31, 39, 63, 64, 65,
127, 129, 255, 257 , 511,
512, 513, 1023, 1024, 1025};
@@ -186,29 +187,32 @@ void test_cuda_contraction_sizes() {
for (int i = 0; i < 15; i++) {
for (int j = 0; j < 15; j++) {
for (int k = 0; k < 17; k++) {
- test_cuda_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
+ test_gpu_contraction<DataLayout>(m_sizes[i], n_sizes[j], k_sizes[k]);
}
}
}
}
-void test_cxx11_tensor_cuda()
+void test_cxx11_tensor_gpu()
{
- CALL_SUBTEST_1(test_cuda_contraction<ColMajor>(128, 128, 128));
- CALL_SUBTEST_1(test_cuda_contraction<RowMajor>(128, 128, 128));
+ CALL_SUBTEST_1(test_gpu_contraction<ColMajor>(128, 128, 128));
+ CALL_SUBTEST_1(test_gpu_contraction<RowMajor>(128, 128, 128));
CALL_SUBTEST_1(test_scalar<ColMajor>(128, 128, 128));
CALL_SUBTEST_1(test_scalar<RowMajor>(128, 128, 128));
- CALL_SUBTEST_2(test_cuda_contraction_m<ColMajor>());
- CALL_SUBTEST_3(test_cuda_contraction_m<RowMajor>());
+ CALL_SUBTEST_2(test_gpu_contraction_m<ColMajor>());
+ CALL_SUBTEST_3(test_gpu_contraction_m<RowMajor>());
- CALL_SUBTEST_4(test_cuda_contraction_k<ColMajor>());
- CALL_SUBTEST_5(test_cuda_contraction_k<RowMajor>());
+ CALL_SUBTEST_4(test_gpu_contraction_k<ColMajor>());
+ CALL_SUBTEST_5(test_gpu_contraction_k<RowMajor>());
- CALL_SUBTEST_6(test_cuda_contraction_n<ColMajor>());
- CALL_SUBTEST_7(test_cuda_contraction_n<RowMajor>());
+ CALL_SUBTEST_6(test_gpu_contraction_n<ColMajor>());
+ CALL_SUBTEST_7(test_gpu_contraction_n<RowMajor>());
- CALL_SUBTEST_8(test_cuda_contraction_sizes<ColMajor>());
- CALL_SUBTEST_9(test_cuda_contraction_sizes<RowMajor>());
+#if !defined(EIGEN_USE_HIP)
+// disable these subtests for HIP
+ CALL_SUBTEST_8(test_gpu_contraction_sizes<ColMajor>());
+ CALL_SUBTEST_9(test_gpu_contraction_sizes<RowMajor>());
+#endif
}