From 00f32752f7d0b193c6788691c3cf0b76457a044d Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Thu, 28 Nov 2019 10:08:54 +0000
Subject: [SYCL] Rebasing the SYCL support branch on top of the Eigen upstream master branch.

* Unifying all loadLocalTile calls from lhs and rhs into an extract_block function.
* Adding the get_tensor operation, which was missing in TensorContractionMapper.
* Adding the -D option missing from CMake for the Disable_Skinny contraction operation.
* Wrapping all the indices in TensorScanSycl into a Scan parameter struct.
* Fixing a typo in Device SYCL.
* Unifying the load to private registers for tall/skinny no-shared.
* Unifying the load to vector tile for tensor-vector/vector-tensor operations.
* Removing all the LHS/RHS classes for extracting data from global memory.
* Removing Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local-memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining general tensor-vector and vector-tensor contraction into one kernel.
* Making double buffering optional for tensor contraction when the local-memory version is used.
* Modifying the benchmark to accept custom reduction sizes.
* Disabling AVX optimization for the SYCL backend on the host to allow SSE optimization on the host.
* Adding tests for SYCL.
* Modifying the SYCL CMake configuration.
---
 unsupported/test/cxx11_tensor_reverse_sycl.cpp | 128 +++++++++++++++----------
 1 file changed, 80 insertions(+), 48 deletions(-)

diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
index 77c2235d1..dd30c235d 100644
--- a/unsupported/test/cxx11_tensor_reverse_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp
@@ -20,10 +20,8 @@
 #include "main.h"
 #include <unsupported/Eigen/CXX11/Tensor>
 
-
 template <typename DataType, int DataLayout, typename IndexType>
-static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
-
+static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
   IndexType dim1 = 2;
   IndexType dim2 = 3;
   IndexType dim3 = 5;
@@ -40,21 +38,30 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
   dim_rev[2] = true;
   dim_rev[3] = false;
 
-  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType)));
-  DataType* gpu_out_data =static_cast<DataType*>(sycl_device.allocate(reversed_tensor.dimensions().TotalSize()*sizeof(DataType)));
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = static_cast<DataType*>(sycl_device.allocate(
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType)));
 
-  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
-  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data,
+                                                                tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu(gpu_out_data,
+                                                                 tensorRange);
 
-  sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, tensor.data(),
+      (tensor.dimensions().TotalSize()) * sizeof(DataType));
   out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
-  sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(
+      reversed_tensor.data(), gpu_out_data,
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
   // Check that the CPU and GPU reductions return the same result.
   for (IndexType i = 0; i < 2; ++i) {
     for (IndexType j = 0; j < 3; ++j) {
       for (IndexType k = 0; k < 5; ++k) {
         for (IndexType l = 0; l < 7; ++l) {
-          VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l));
+          VERIFY_IS_EQUAL(tensor(i, j, k, l),
+                          reversed_tensor(i, 2 - j, 4 - k, l));
         }
       }
     }
@@ -65,13 +72,15 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
   dim_rev[3] = false;
 
   out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
-  sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(
+      reversed_tensor.data(), gpu_out_data,
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
 
   for (IndexType i = 0; i < 2; ++i) {
     for (IndexType j = 0; j < 3; ++j) {
       for (IndexType k = 0; k < 5; ++k) {
         for (IndexType l = 0; l < 7; ++l) {
-          VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l));
+          VERIFY_IS_EQUAL(tensor(i, j, k, l), reversed_tensor(1 - i, j, k, l));
         }
       }
     }
@@ -82,13 +91,16 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
   dim_rev[2] = false;
   dim_rev[3] = true;
   out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
-  sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(
+      reversed_tensor.data(), gpu_out_data,
+      reversed_tensor.dimensions().TotalSize() * sizeof(DataType));
 
   for (IndexType i = 0; i < 2; ++i) {
     for (IndexType j = 0; j < 3; ++j) {
       for (IndexType k = 0; k < 5; ++k) {
         for (IndexType l = 0; l < 7; ++l) {
-          VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l));
+          VERIFY_IS_EQUAL(tensor(i, j, k, l),
+                          reversed_tensor(1 - i, j, k, 6 - l));
         }
       }
     }
@@ -98,11 +110,9 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) {
   sycl_device.deallocate(gpu_out_data);
 }
 
-
-
 template <typename DataType, int DataLayout, typename IndexType>
-static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue)
-{
+static void test_expr_reverse(const Eigen::SyclDevice& sycl_device,
+                              bool LValue) {
   IndexType dim1 = 2;
   IndexType dim2 = 3;
   IndexType dim3 = 5;
@@ -120,24 +130,32 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
   dim_rev[2] = false;
   dim_rev[3] = true;
 
-  DataType* gpu_in_data = static_cast<DataType*>(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType)));
-  DataType* gpu_out_data_expected =static_cast<DataType*>(sycl_device.allocate(expected.dimensions().TotalSize()*sizeof(DataType)));
-  DataType* gpu_out_data_result =static_cast<DataType*>(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType)));
-
-  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data, tensorRange);
-  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected(gpu_out_data_expected, tensorRange);
-  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result(gpu_out_data_result, tensorRange);
+  DataType* gpu_in_data = static_cast<DataType*>(
+      sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data_expected = static_cast<DataType*>(sycl_device.allocate(
+      expected.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data_result = static_cast<DataType*>(
+      sycl_device.allocate(result.dimensions().TotalSize() * sizeof(DataType)));
 
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > in_gpu(gpu_in_data,
+                                                                tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_expected(
+      gpu_out_data_expected, tensorRange);
+  TensorMap<Tensor<DataType, 4, DataLayout, IndexType> > out_gpu_result(
+      gpu_out_data_result, tensorRange);
 
-  sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, tensor.data(),
+      (tensor.dimensions().TotalSize()) * sizeof(DataType));
   if (LValue) {
     out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu;
   } else {
     out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev);
   }
-  sycl_device.memcpyDeviceToHost(expected.data(), gpu_out_data_expected, expected.dimensions().TotalSize()*sizeof(DataType));
-
+  sycl_device.memcpyDeviceToHost(
+      expected.data(), gpu_out_data_expected,
+      expected.dimensions().TotalSize() * sizeof(DataType));
 
   array<IndexType, 4> src_slice_dim;
   src_slice_dim[0] = 2;
@@ -154,8 +172,9 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
 
   for (IndexType i = 0; i < 5; ++i) {
     if (LValue) {
-      out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) =
-          in_gpu.slice(src_slice_start, src_slice_dim);
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim)
+          .reverse(dim_rev)
+          .device(sycl_device) = in_gpu.slice(src_slice_start, src_slice_dim);
     } else {
       out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
           in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev);
@@ -163,13 +182,15 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
     src_slice_start[2] += 1;
     dst_slice_start[2] += 1;
   }
-  sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(
+      result.data(), gpu_out_data_result,
+      result.dimensions().TotalSize() * sizeof(DataType));
 
   for (IndexType i = 0; i < expected.dimension(0); ++i) {
     for (IndexType j = 0; j < expected.dimension(1); ++j) {
       for (IndexType k = 0; k < expected.dimension(2); ++k) {
         for (IndexType l = 0; l < expected.dimension(3); ++l) {
-          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+          VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l));
         }
       }
     }
@@ -177,34 +198,37 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue
 
   dst_slice_start[2] = 0;
   result.setRandom();
-  sycl_device.memcpyHostToDevice(gpu_out_data_result, result.data(),(result.dimensions().TotalSize())*sizeof(DataType));
+  sycl_device.memcpyHostToDevice(
+      gpu_out_data_result, result.data(),
+      (result.dimensions().TotalSize()) * sizeof(DataType));
   for (IndexType i = 0; i < 5; ++i) {
-    if (LValue) {
-      out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) =
-          in_gpu.slice(dst_slice_start, dst_slice_dim);
-    } else {
-      out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
-          in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
-    }
+    if (LValue) {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim)
+          .reverse(dim_rev)
+          .device(sycl_device) = in_gpu.slice(dst_slice_start, dst_slice_dim);
+    } else {
+      out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) =
+          in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim);
+    }
     dst_slice_start[2] += 1;
   }
-  sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType));
+  sycl_device.memcpyDeviceToHost(
+      result.data(), gpu_out_data_result,
+      result.dimensions().TotalSize() * sizeof(DataType));
 
   for (IndexType i = 0; i < expected.dimension(0); ++i) {
     for (IndexType j = 0; j < expected.dimension(1); ++j) {
       for (IndexType k = 0; k < expected.dimension(2); ++k) {
         for (IndexType l = 0; l < expected.dimension(3); ++l) {
-          VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l));
+          VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l));
         }
       }
     }
   }
 }
 
-
-
-template <typename DataType> void sycl_reverse_test_per_device(const cl::sycl::device& d){
-  std::cout << "Running on " << d.template get_info<cl::sycl::info::device::name>() << std::endl;
+template <typename DataType>
+void sycl_reverse_test_per_device(const cl::sycl::device& d) {
   QueueInterface queueInterface(d);
   auto sycl_device = Eigen::SyclDevice(&queueInterface);
   test_simple_reverse<DataType, RowMajor, int64_t>(sycl_device);
@@ -215,7 +239,15 @@ template <typename DataType> void sycl_reverse_test_per_device(const cl::sycl::de
   test_expr_reverse<DataType, ColMajor, int64_t>(sycl_device, true);
 }
 EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) {
-  for (const auto& device :Eigen::get_sycl_supported_devices()) {
-    CALL_SUBTEST(sycl_reverse_test_per_device<float>(device));
+  for (const auto& device : Eigen::get_sycl_supported_devices()) {
+    std::cout << "Running on "
+              << device.get_info<cl::sycl::info::device::name>() << std::endl;
+    CALL_SUBTEST_1(sycl_reverse_test_per_device<short>(device));
+    CALL_SUBTEST_2(sycl_reverse_test_per_device<int>(device));
+    CALL_SUBTEST_3(sycl_reverse_test_per_device<unsigned int>(device));
+#ifdef EIGEN_SYCL_DOUBLE_SUPPORT
+    CALL_SUBTEST_4(sycl_reverse_test_per_device<double>(device));
+#endif
+    CALL_SUBTEST_5(sycl_reverse_test_per_device<float>(device));
   }
 }
-- 
cgit v1.2.3
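
For readers who want to try the idiom outside the test harness, the pattern the updated test exercises (allocate raw device buffers, view them through TensorMap, evaluate reverse() on the SYCL device, copy back and compare) can be distilled into a short standalone program. The sketch below is not part of the patch: it assumes a SYCL-enabled Eigen build with the unsupported Tensor module and at least one device reported by Eigen::get_sycl_supported_devices(), and the 2-D shape and float element type are illustrative choices rather than anything the commit prescribes.

#define EIGEN_USE_SYCL
#include <unsupported/Eigen/CXX11/Tensor>

#include <iostream>

int main() {
  // Wrap the first supported SYCL device in an Eigen device, as
  // sycl_reverse_test_per_device does in the test above.
  auto devices = Eigen::get_sycl_supported_devices();
  Eigen::QueueInterface queue_interface(devices[0]);
  Eigen::SyclDevice sycl_device(&queue_interface);

  // A small 2-D tensor; the test uses 4-D tensors, but the idiom is the same.
  Eigen::array<int64_t, 2> range = {{4, 8}};
  Eigen::Tensor<float, 2, Eigen::RowMajor, int64_t> in(range), out(range);
  in.setRandom();

  // Reverse along dimension 0 only.
  Eigen::array<bool, 2> dim_rev = {{true, false}};

  // Stage the input on the device, evaluate reverse() there, copy back.
  const size_t bytes = in.dimensions().TotalSize() * sizeof(float);
  float* gpu_in = static_cast<float*>(sycl_device.allocate(bytes));
  float* gpu_out = static_cast<float*>(sycl_device.allocate(bytes));
  Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, int64_t> > in_gpu(
      gpu_in, range);
  Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor, int64_t> > out_gpu(
      gpu_out, range);

  sycl_device.memcpyHostToDevice(gpu_in, in.data(), bytes);
  out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev);
  sycl_device.memcpyDeviceToHost(out.data(), gpu_out, bytes);

  // Row i of the output should equal row 3 - i of the input.
  std::cout << "out(0,0) == in(3,0): " << (out(0, 0) == in(3, 0)) << std::endl;

  sycl_device.deallocate(gpu_in);
  sycl_device.deallocate(gpu_out);
  return 0;
}

Note the explicit allocate/memcpyHostToDevice/memcpyDeviceToHost calls: as in the test, the SYCL backend leaves host-device data movement to the caller, and only the expression assigned through .device(sycl_device) is evaluated on the device.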