From 00f32752f7d0b193c6788691c3cf0b76457a044d Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Thu, 28 Nov 2019 10:08:54 +0000
Subject: [SYCL] Rebasing the SYCL support branch on top of the Eigen upstream master branch.

* Unifying all loadLocalTile from lhs and rhs to an extract_block function.
* Adding get_tensor operation which was missing in TensorContractionMapper.
* Adding the -D flag missing from cmake for the Disable_Skinny Contraction operation.
* Wrapping all the indices in TensorScanSycl into Scan parameter struct.
* Fixing typo in Device SYCL
* Unifying load to private register for tall/skinny no shared
* Unifying load to vector tile for tensor-vector/vector-tensor operation
* Removing all the LHS/RHS classes for extracting data from global memory
* Removing Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining General Tensor-Vector and VectorTensor contraction into one kernel.
* Making double buffering optional for Tensor contraction when the local memory version is used.
* Modifying benchmark to accept custom Reduction Sizes
* Disabling AVX optimization for the SYCL backend on the host to allow SSE optimization on the host
* Adding tests for SYCL
* Modifying SYCL CMake
---
 unsupported/test/CMakeLists.txt | 129 ++-
 unsupported/test/cxx11_tensor_argmax_sycl.cpp | 136 +--
 unsupported/test/cxx11_tensor_builtins_sycl.cpp | 497 ++++++----
 unsupported/test/cxx11_tensor_chipping_sycl.cpp | 7 +-
 unsupported/test/cxx11_tensor_contract_sycl.cpp | 1010 +++++++++++++++++---
 unsupported/test/cxx11_tensor_custom_op_sycl.cpp | 5 +
 unsupported/test/cxx11_tensor_forced_eval_sycl.cpp | 5 +-
 unsupported/test/cxx11_tensor_image_op_sycl.cpp | 103 ++
 unsupported/test/cxx11_tensor_math_sycl.cpp | 105 ++
 unsupported/test/cxx11_tensor_morphing_sycl.cpp | 138 +++
 unsupported/test/cxx11_tensor_random_sycl.cpp | 100 ++
 unsupported/test/cxx11_tensor_reduction_sycl.cpp | 941 ++++++++++++++++--
 unsupported/test/cxx11_tensor_reverse_sycl.cpp | 128 ++-
 unsupported/test/cxx11_tensor_scan_sycl.cpp | 141 +++
 unsupported/test/cxx11_tensor_shuffling_sycl.cpp | 52 +-
 unsupported/test/cxx11_tensor_sycl.cpp | 91 +-
 16 files changed, 3019 insertions(+), 569 deletions(-)
 create mode 100644 unsupported/test/cxx11_tensor_image_op_sycl.cpp
 create mode 100644 unsupported/test/cxx11_tensor_math_sycl.cpp
 create mode 100644 unsupported/test/cxx11_tensor_random_sycl.cpp
 create mode 100644 unsupported/test/cxx11_tensor_scan_sycl.cpp

diff --git a/unsupported/test/CMakeLists.txt b/unsupported/test/CMakeLists.txt
index 3d9ac9263..9db965ad8 100644
--- a/unsupported/test/CMakeLists.txt
+++ b/unsupported/test/CMakeLists.txt
@@ -111,40 +111,113 @@ ei_add_test(special_functions)
 if(EIGEN_TEST_CXX11)
 if(EIGEN_TEST_SYCL)
+ set(EIGEN_SYCL ON)
+ # Forward CMake options as preprocessor definitions
+ if(EIGEN_SYCL_USE_DEFAULT_SELECTOR)
+ add_definitions(-DEIGEN_SYCL_USE_DEFAULT_SELECTOR=${EIGEN_SYCL_USE_DEFAULT_SELECTOR})
+ endif()
+ if(EIGEN_SYCL_NO_LOCAL_MEM)
+ add_definitions(-DEIGEN_SYCL_NO_LOCAL_MEM=${EIGEN_SYCL_NO_LOCAL_MEM})
+ endif()
+ if(EIGEN_SYCL_LOCAL_MEM)
+ add_definitions(-DEIGEN_SYCL_LOCAL_MEM=${EIGEN_SYCL_LOCAL_MEM})
+ endif()
+ if(EIGEN_SYCL_MAX_GLOBAL_RANGE)
+ add_definitions(-DEIGEN_SYCL_MAX_GLOBAL_RANGE=${EIGEN_SYCL_MAX_GLOBAL_RANGE})
+ endif()
+ if(EIGEN_SYCL_LOCAL_THREAD_DIM0)
+
add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM0=${EIGEN_SYCL_LOCAL_THREAD_DIM0}) + endif() + if(EIGEN_SYCL_LOCAL_THREAD_DIM1) + add_definitions(-DEIGEN_SYCL_LOCAL_THREAD_DIM1=${EIGEN_SYCL_LOCAL_THREAD_DIM1}) + endif() + if(EIGEN_SYCL_REG_M) + add_definitions(-DEIGEN_SYCL_REG_M=${EIGEN_SYCL_REG_M}) + endif() + if(EIGEN_SYCL_REG_N) + add_definitions(-DEIGEN_SYCL_REG_N=${EIGEN_SYCL_REG_N}) + endif() + if(EIGEN_SYCL_USE_PROGRAM_CLASS) + add_definitions(-DEIGEN_SYCL_USE_PROGRAM_CLASS=${EIGEN_SYCL_USE_PROGRAM_CLASS}) + endif() + if(EIGEN_SYCL_ASYNC_EXECUTION) + add_definitions(-DEIGEN_SYCL_ASYNC_EXECUTION=${EIGEN_SYCL_ASYNC_EXECUTION}) + endif() + if(EIGEN_SYCL_DISABLE_SKINNY) + add_definitions(-DEIGEN_SYCL_DISABLE_SKINNY=${EIGEN_SYCL_DISABLE_SKINNY}) + endif() + if(EIGEN_SYCL_DISABLE_DOUBLE_BUFFER) + add_definitions(-DEIGEN_SYCL_DISABLE_DOUBLE_BUFFER=${EIGEN_SYCL_DISABLE_DOUBLE_BUFFER}) + endif() + if(EIGEN_SYCL_DISABLE_RANK1) + add_definitions(-DEIGEN_SYCL_DISABLE_RANK1=${EIGEN_SYCL_DISABLE_RANK1}) + endif() + if(EIGEN_SYCL_DISABLE_SCALAR) + add_definitions(-DEIGEN_SYCL_DISABLE_SCALAR=${EIGEN_SYCL_DISABLE_SCALAR}) + endif() + if(EIGEN_SYCL_DISABLE_GEMV) + add_definitions(-DEIGEN_SYCL_DISABLE_GEMV=${EIGEN_SYCL_DISABLE_GEMV}) + endif() + if(EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION) + add_definitions(-DEIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION=${EIGEN_SYCL_DISABLE_ARM_GPU_CACHE_OPTIMISATION}) + endif() + if(EIGEN_SYCL_TRISYCL) set(CMAKE_CXX_STANDARD 14) set(STD_CXX_FLAG "-std=c++1z") else() - # It should be safe to always run these tests as there is some fallback code for - # older compiler that don't support cxx11. - # This is already set if EIGEN_TEST_CXX11 is enabled: - # set(CMAKE_CXX_STANDARD 11) - # set(STD_CXX_FLAG "-std=c++11") + if(MSVC) + # Set the host and device compilers C++ standard to C++14. On Windows setting this to C++11 + # can cause issues with the ComputeCpp device compiler parsing Visual Studio Headers. + set(CMAKE_CXX_STANDARD 14) + list(APPEND COMPUTECPP_USER_FLAGS -DWIN32) + else() + set(CMAKE_CXX_STANDARD 11) + list(APPEND COMPUTECPP_USER_FLAGS -Wall) + endif() + # The following flags are not supported by Clang and can cause warnings + # if used with -Werror so they are removed here. 
+ if(COMPUTECPP_USE_COMPILER_DRIVER) + set(CMAKE_CXX_COMPILER ${ComputeCpp_DEVICE_COMPILER_EXECUTABLE}) + string(REPLACE "-Wlogical-op" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + string(REPLACE "-Wno-psabi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + string(REPLACE "-ansi" "" CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) + endif() + list(APPEND COMPUTECPP_USER_FLAGS + -DEIGEN_NO_ASSERTION_CHECKING=1 + -no-serial-memop + -Xclang + -cl-mad-enable) endif() - ei_add_test_sycl(cxx11_tensor_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_device_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_padding_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_contract_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_striding_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_generator_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_patch_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG}) - ei_add_test_sycl(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_image_op_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_math_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_forced_eval_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_broadcast_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_device_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_reduction_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_morphing_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_shuffling_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_padding_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_builtins_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_contract_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_concatenation_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_reverse_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_convolution_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_striding_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_chipping_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_layout_swap_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_inflation_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_random_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_generator_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_patch_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_image_patch_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_volume_patch_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_argmax_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_custom_op_sycl ${STD_CXX_FLAG}) + ei_add_test(cxx11_tensor_scan_sycl ${STD_CXX_FLAG}) + set(EIGEN_SYCL OFF) endif() ei_add_test(cxx11_eventcount "-pthread" 
"${CMAKE_THREAD_LIBS_INIT}") diff --git a/unsupported/test/cxx11_tensor_argmax_sycl.cpp b/unsupported/test/cxx11_tensor_argmax_sycl.cpp index 0bbb0f6dc..41ea3cf7b 100644 --- a/unsupported/test/cxx11_tensor_argmax_sycl.cpp +++ b/unsupported/test/cxx11_tensor_argmax_sycl.cpp @@ -18,6 +18,7 @@ #define EIGEN_USE_SYCL #include "main.h" + #include using Eigen::array; @@ -26,9 +27,8 @@ using Eigen::Tensor; using Eigen::TensorMap; template -static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){ - - Tensor in(Eigen::array{{2,2,2}}); +static void test_sycl_simple_argmax(const Eigen::SyclDevice& sycl_device) { + Tensor in(Eigen::array{{2, 2, 2}}); Tensor out_max; Tensor out_min; in.setRandom(); @@ -39,14 +39,15 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){ std::size_t in_bytes = in.size() * sizeof(DataType); std::size_t out_bytes = out_max.size() * sizeof(DenseIndex); - DataType * d_in = static_cast(sycl_device.allocate(in_bytes)); + DataType* d_in = static_cast(sycl_device.allocate(in_bytes)); DenseIndex* d_out_max = static_cast(sycl_device.allocate(out_bytes)); DenseIndex* d_out_min = static_cast(sycl_device.allocate(out_bytes)); - Eigen::TensorMap > gpu_in(d_in, Eigen::array{{2,2,2}}); + Eigen::TensorMap > gpu_in(d_in, + Eigen::array{{2, 2, 2}}); Eigen::TensorMap > gpu_out_max(d_out_max); Eigen::TensorMap > gpu_out_min(d_out_min); - sycl_device.memcpyHostToDevice(d_in, in.data(),in_bytes); + sycl_device.memcpyHostToDevice(d_in, in.data(), in_bytes); gpu_out_max.device(sycl_device) = gpu_in.argmax(); gpu_out_min.device(sycl_device) = gpu_in.argmin(); @@ -54,7 +55,7 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){ sycl_device.memcpyDeviceToHost(out_max.data(), d_out_max, out_bytes); sycl_device.memcpyDeviceToHost(out_min.data(), d_out_min, out_bytes); - VERIFY_IS_EQUAL(out_max(), 2*2*2 - 1); + VERIFY_IS_EQUAL(out_max(), 2 * 2 * 2 - 1); VERIFY_IS_EQUAL(out_min(), 0); sycl_device.deallocate(d_in); @@ -62,22 +63,22 @@ static void test_sycl_simple_argmax(const Eigen::SyclDevice &sycl_device){ sycl_device.deallocate(d_out_min); } - template -static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) -{ - DenseIndex sizeDim0=9; - DenseIndex sizeDim1=3; - DenseIndex sizeDim2=5; - DenseIndex sizeDim3=7; - Tensor tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3); +static void test_sycl_argmax_dim(const Eigen::SyclDevice& sycl_device) { + DenseIndex sizeDim0 = 9; + DenseIndex sizeDim1 = 3; + DenseIndex sizeDim2 = 5; + DenseIndex sizeDim3 = 7; + Tensor tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3); std::vector dims; - dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3); + dims.push_back(sizeDim0); + dims.push_back(sizeDim1); + dims.push_back(sizeDim2); + dims.push_back(sizeDim3); for (DenseIndex dim = 0; dim < 4; ++dim) { - array out_shape; - for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1]; + for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? 
dims[d] : dims[d + 1]; Tensor tensor_arg(out_shape); @@ -86,9 +87,13 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) for (DenseIndex j = 0; j < sizeDim1; ++j) { for (DenseIndex k = 0; k < sizeDim2; ++k) { for (DenseIndex l = 0; l < sizeDim3; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 - tensor(ix)=(ix[dim] != 0)?-1.0:10.0; + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) + // = 10.0 + tensor(ix) = (ix[dim] != 0) ? -1.0 : 10.0; } } } @@ -97,23 +102,23 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) std::size_t in_bytes = tensor.size() * sizeof(DataType); std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + DataType* d_in = static_cast(sycl_device.allocate(in_bytes)); + DenseIndex* d_out = static_cast(sycl_device.allocate(out_bytes)); - DataType * d_in = static_cast(sycl_device.allocate(in_bytes)); - DenseIndex* d_out= static_cast(sycl_device.allocate(out_bytes)); - - Eigen::TensorMap > gpu_in(d_in, Eigen::array{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}}); + Eigen::TensorMap > gpu_in( + d_in, Eigen::array{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}}); Eigen::TensorMap > gpu_out(d_out, out_shape); - sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); gpu_out.device(sycl_device) = gpu_in.argmax(dim); sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); VERIFY_IS_EQUAL(static_cast(tensor_arg.size()), - size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim))); + size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim))); for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { // Expect max to be in the first index of the reduced dimension - VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); } sycl_device.synchronize(); @@ -122,15 +127,18 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) for (DenseIndex j = 0; j < sizeDim1; ++j) { for (DenseIndex k = 0; k < sizeDim2; ++k) { for (DenseIndex l = 0; l < sizeDim3; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 - tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?-1.0:20.0; + tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 
-1.0 : 20.0; } } } } - sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); gpu_out.device(sycl_device) = gpu_in.argmax(dim); sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); @@ -144,20 +152,21 @@ static void test_sycl_argmax_dim(const Eigen::SyclDevice &sycl_device) } template -static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) -{ - DenseIndex sizeDim0=9; - DenseIndex sizeDim1=3; - DenseIndex sizeDim2=5; - DenseIndex sizeDim3=7; - Tensor tensor(sizeDim0,sizeDim1,sizeDim2,sizeDim3); +static void test_sycl_argmin_dim(const Eigen::SyclDevice& sycl_device) { + DenseIndex sizeDim0 = 9; + DenseIndex sizeDim1 = 3; + DenseIndex sizeDim2 = 5; + DenseIndex sizeDim3 = 7; + Tensor tensor(sizeDim0, sizeDim1, sizeDim2, sizeDim3); std::vector dims; - dims.push_back(sizeDim0); dims.push_back(sizeDim1); dims.push_back(sizeDim2); dims.push_back(sizeDim3); + dims.push_back(sizeDim0); + dims.push_back(sizeDim1); + dims.push_back(sizeDim2); + dims.push_back(sizeDim3); for (DenseIndex dim = 0; dim < 4; ++dim) { - array out_shape; - for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d+1]; + for (DenseIndex d = 0; d < 3; ++d) out_shape[d] = (d < dim) ? dims[d] : dims[d + 1]; Tensor tensor_arg(out_shape); @@ -166,9 +175,12 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) for (DenseIndex j = 0; j < sizeDim1; ++j) { for (DenseIndex k = 0; k < sizeDim2; ++k) { for (DenseIndex l = 0; l < sizeDim3; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = 10.0 - tensor(ix)=(ix[dim] != 0)?1.0:-10.0; + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 0, k, l) = -10.0 + tensor(ix) = (ix[dim] != 0) ? 
1.0 : -10.0; } } } @@ -177,23 +189,23 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) std::size_t in_bytes = tensor.size() * sizeof(DataType); std::size_t out_bytes = tensor_arg.size() * sizeof(DenseIndex); + DataType* d_in = static_cast(sycl_device.allocate(in_bytes)); + DenseIndex* d_out = static_cast(sycl_device.allocate(out_bytes)); - DataType * d_in = static_cast(sycl_device.allocate(in_bytes)); - DenseIndex* d_out= static_cast(sycl_device.allocate(out_bytes)); - - Eigen::TensorMap > gpu_in(d_in, Eigen::array{{sizeDim0,sizeDim1,sizeDim2,sizeDim3}}); + Eigen::TensorMap > gpu_in( + d_in, Eigen::array{{sizeDim0, sizeDim1, sizeDim2, sizeDim3}}); Eigen::TensorMap > gpu_out(d_out, out_shape); - sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); gpu_out.device(sycl_device) = gpu_in.argmin(dim); sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); VERIFY_IS_EQUAL(static_cast(tensor_arg.size()), - size_t(sizeDim0*sizeDim1*sizeDim2*sizeDim3 / tensor.dimension(dim))); + size_t(sizeDim0 * sizeDim1 * sizeDim2 * sizeDim3 / tensor.dimension(dim))); for (DenseIndex n = 0; n < tensor_arg.size(); ++n) { // Expect max to be in the first index of the reduced dimension - VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); + VERIFY_IS_EQUAL(tensor_arg.data()[n], 0); } sycl_device.synchronize(); @@ -202,15 +214,18 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) for (DenseIndex j = 0; j < sizeDim1; ++j) { for (DenseIndex k = 0; k < sizeDim2; ++k) { for (DenseIndex l = 0; l < sizeDim3; ++l) { - ix[0] = i; ix[1] = j; ix[2] = k; ix[3] = l; - // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = 20.0 - tensor(ix)=(ix[dim] != tensor.dimension(dim) - 1)?1.0:-20.0; + ix[0] = i; + ix[1] = j; + ix[2] = k; + ix[3] = l; + // suppose dim == 1, then for all i, k, l, set tensor(i, 2, k, l) = -20.0 + tensor(ix) = (ix[dim] != tensor.dimension(dim) - 1) ? 
1.0 : -20.0; } } } } - sycl_device.memcpyHostToDevice(d_in, tensor.data(),in_bytes); + sycl_device.memcpyHostToDevice(d_in, tensor.data(), in_bytes); gpu_out.device(sycl_device) = gpu_in.argmin(dim); sycl_device.memcpyDeviceToHost(tensor_arg.data(), d_out, out_bytes); @@ -223,10 +238,8 @@ static void test_sycl_argmin_dim(const Eigen::SyclDevice &sycl_device) } } - - - -template void sycl_argmax_test_per_device(const Device_Selector& d){ +template +void sycl_argmax_test_per_device(const Device_Selector& d) { QueueInterface queueInterface(d); auto sycl_device = Eigen::SyclDevice(&queueInterface); test_sycl_simple_argmax(sycl_device); @@ -238,8 +251,7 @@ template void sycl_argmax_test_per_ } EIGEN_DECLARE_TEST(cxx11_tensor_argmax_sycl) { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_argmax_test_per_device(device)); + for (const auto& device : Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_argmax_test_per_device(device)); } - } diff --git a/unsupported/test/cxx11_tensor_builtins_sycl.cpp b/unsupported/test/cxx11_tensor_builtins_sycl.cpp index db2975783..72cb62fd5 100644 --- a/unsupported/test/cxx11_tensor_builtins_sycl.cpp +++ b/unsupported/test/cxx11_tensor_builtins_sycl.cpp @@ -25,243 +25,330 @@ using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -namespace std { -template T rsqrt(T x) { return 1 / std::sqrt(x); } +// Functions used to compare the TensorMap implementation on the device with +// the equivalent on the host +namespace cl { +namespace sycl { +template T abs(T x) { return cl::sycl::fabs(x); } template T square(T x) { return x * x; } template T cube(T x) { return x * x * x; } -template T inverse(T x) { return 1 / x; } +template T inverse(T x) { return T(1) / x; } +template T cwiseMax(T x, T y) { return cl::sycl::max(x, y); } +template T cwiseMin(T x, T y) { return cl::sycl::min(x, y); } } +} + +struct EqualAssignement { + template + void operator()(Lhs& lhs, const Rhs& rhs) { lhs = rhs; } +}; + +struct PlusEqualAssignement { + template + void operator()(Lhs& lhs, const Rhs& rhs) { lhs += rhs; } +}; -#define TEST_UNARY_BUILTINS_FOR_SCALAR(FUNC, SCALAR, OPERATOR, Layout) \ - { \ - /* out OPERATOR in.FUNC() */ \ - Tensor in(tensorRange); \ - Tensor out(tensorRange); \ - in = in.random() + static_cast(0.01); \ - out = out.random() + static_cast(0.01); \ - Tensor reference(out); \ - SCALAR *gpu_data = static_cast( \ - sycl_device.allocate(in.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_out = static_cast( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap> gpu(gpu_data, tensorRange); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data, in.data(), \ - (in.size()) * sizeof(SCALAR)); \ - sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \ - (out.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) OPERATOR gpu.FUNC(); \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - SCALAR ver = reference(i); \ - ver OPERATOR std::FUNC(in(i)); \ - VERIFY_IS_APPROX(out(i), ver); \ - } \ - sycl_device.deallocate(gpu_data); \ - sycl_device.deallocate(gpu_data_out); \ - } \ - { \ - /* out OPERATOR out.FUNC() */ \ - Tensor out(tensorRange); \ - out = out.random() + static_cast(0.01); \ - Tensor reference(out); \ - SCALAR *gpu_data_out = static_cast( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ - 
sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), \ - (out.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) OPERATOR gpu_out.FUNC(); \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - SCALAR ver = reference(i); \ - ver OPERATOR std::FUNC(reference(i)); \ - VERIFY_IS_APPROX(out(i), ver); \ - } \ - sycl_device.deallocate(gpu_data_out); \ +template +void test_unary_builtins_for_scalar(const Eigen::SyclDevice& sycl_device, + const array& tensor_range) { + Operator op; + Assignement asgn; + { + /* Assignement(out, Operator(in)) */ + Tensor in(tensor_range); + Tensor out(tensor_range); + in = in.random() + DataType(0.01); + out = out.random() + DataType(0.01); + Tensor reference(out); + DataType *gpu_data = static_cast( + sycl_device.allocate(in.size() * sizeof(DataType))); + DataType *gpu_data_out = static_cast( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap> gpu(gpu_data, tensor_range); + TensorMap> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data, in.data(), + (in.size()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), + (out.size()) * sizeof(DataType)); + auto device_expr = gpu_out.device(sycl_device); + asgn(device_expr, op(gpu)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + DataType ver = reference(i); + asgn(ver, op(in(i))); + VERIFY_IS_APPROX(out(i), ver); + } + sycl_device.deallocate(gpu_data); + sycl_device.deallocate(gpu_data_out); } + { + /* Assignement(out, Operator(out)) */ + Tensor out(tensor_range); + out = out.random() + DataType(0.01); + Tensor reference(out); + DataType *gpu_data_out = static_cast( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data_out, out.data(), + (out.size()) * sizeof(DataType)); + auto device_expr = gpu_out.device(sycl_device); + asgn(device_expr, op(gpu_out)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + DataType ver = reference(i); + asgn(ver, op(reference(i))); + VERIFY_IS_APPROX(out(i), ver); + } + sycl_device.deallocate(gpu_data_out); + } +} + +#define DECLARE_UNARY_STRUCT(FUNC) \ + struct op_##FUNC { \ + template \ + auto operator()(const T& x) -> decltype(cl::sycl::FUNC(x)) { \ + return cl::sycl::FUNC(x); \ + } \ + template \ + auto operator()(const TensorMap& x) -> decltype(x.FUNC()) { \ + return x.FUNC(); \ + } \ + }; -#define TEST_UNARY_BUILTINS_OPERATOR(SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(sqrt, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(rsqrt, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(square, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(cube, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(inverse, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(tanh, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(exp, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(expm1, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(log, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(abs, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(ceil, SCALAR, 
OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(floor, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(round, SCALAR, OPERATOR , Layout) \ - TEST_UNARY_BUILTINS_FOR_SCALAR(log1p, SCALAR, OPERATOR , Layout) +DECLARE_UNARY_STRUCT(abs) +DECLARE_UNARY_STRUCT(sqrt) +DECLARE_UNARY_STRUCT(rsqrt) +DECLARE_UNARY_STRUCT(square) +DECLARE_UNARY_STRUCT(cube) +DECLARE_UNARY_STRUCT(inverse) +DECLARE_UNARY_STRUCT(tanh) +DECLARE_UNARY_STRUCT(exp) +DECLARE_UNARY_STRUCT(expm1) +DECLARE_UNARY_STRUCT(log) +DECLARE_UNARY_STRUCT(ceil) +DECLARE_UNARY_STRUCT(floor) +DECLARE_UNARY_STRUCT(round) +DECLARE_UNARY_STRUCT(log1p) +DECLARE_UNARY_STRUCT(sign) +DECLARE_UNARY_STRUCT(isnan) +DECLARE_UNARY_STRUCT(isfinite) +DECLARE_UNARY_STRUCT(isinf) -#define TEST_IS_THAT_RETURNS_BOOL(SCALAR, FUNC, Layout) \ - { \ - /* out = in.FUNC() */ \ - Tensor in(tensorRange); \ - Tensor out(tensorRange); \ - in = in.random() + static_cast(0.01); \ - SCALAR *gpu_data = static_cast( \ - sycl_device.allocate(in.size() * sizeof(SCALAR))); \ - bool *gpu_data_out = \ - static_cast(sycl_device.allocate(out.size() * sizeof(bool))); \ - TensorMap> gpu(gpu_data, tensorRange); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data, in.data(), \ - (in.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) = gpu.FUNC(); \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(bool)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - VERIFY_IS_EQUAL(out(i), std::FUNC(in(i))); \ - } \ - sycl_device.deallocate(gpu_data); \ - sycl_device.deallocate(gpu_data_out); \ +template +void test_unary_builtins_for_assignement(const Eigen::SyclDevice& sycl_device, + const array& tensor_range) { +#define RUN_UNARY_TEST(FUNC) \ + test_unary_builtins_for_scalar(sycl_device, tensor_range) + RUN_UNARY_TEST(abs); + RUN_UNARY_TEST(sqrt); + RUN_UNARY_TEST(rsqrt); + RUN_UNARY_TEST(square); + RUN_UNARY_TEST(cube); + RUN_UNARY_TEST(inverse); + RUN_UNARY_TEST(tanh); + RUN_UNARY_TEST(exp); + RUN_UNARY_TEST(expm1); + RUN_UNARY_TEST(log); + RUN_UNARY_TEST(ceil); + RUN_UNARY_TEST(floor); + RUN_UNARY_TEST(round); + RUN_UNARY_TEST(log1p); + RUN_UNARY_TEST(sign); +} + +template +void test_unary_builtins_return_bool(const Eigen::SyclDevice& sycl_device, + const array& tensor_range) { + /* out = op(in) */ + Operator op; + Tensor in(tensor_range); + Tensor out(tensor_range); + in = in.random() + DataType(0.01); + DataType *gpu_data = static_cast( + sycl_device.allocate(in.size() * sizeof(DataType))); + bool *gpu_data_out = + static_cast(sycl_device.allocate(out.size() * sizeof(bool))); + TensorMap> gpu(gpu_data, tensor_range); + TensorMap> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data, in.data(), + (in.size()) * sizeof(DataType)); + gpu_out.device(sycl_device) = op(gpu); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(bool)); + for (int64_t i = 0; i < out.size(); ++i) { + VERIFY_IS_EQUAL(out(i), op(in(i))); } + sycl_device.deallocate(gpu_data); + sycl_device.deallocate(gpu_data_out); +} -#define TEST_UNARY_BUILTINS(SCALAR, Layout) \ - TEST_UNARY_BUILTINS_OPERATOR(SCALAR, +=, Layout) \ - TEST_UNARY_BUILTINS_OPERATOR(SCALAR, =, Layout) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isnan, Layout) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isfinite, Layout) \ - TEST_IS_THAT_RETURNS_BOOL(SCALAR, isinf, Layout) +template +void test_unary_builtins(const Eigen::SyclDevice& sycl_device, + const array& tensor_range) { + 
test_unary_builtins_for_assignement(sycl_device, tensor_range); + test_unary_builtins_for_assignement(sycl_device, tensor_range); + test_unary_builtins_return_bool(sycl_device, tensor_range); + test_unary_builtins_return_bool(sycl_device, tensor_range); + test_unary_builtins_return_bool(sycl_device, tensor_range); +} +template static void test_builtin_unary_sycl(const Eigen::SyclDevice &sycl_device) { int64_t sizeDim1 = 10; int64_t sizeDim2 = 10; int64_t sizeDim3 = 10; - array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + array tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; - TEST_UNARY_BUILTINS(float, RowMajor) - TEST_UNARY_BUILTINS(float, ColMajor) + test_unary_builtins(sycl_device, tensor_range); + test_unary_builtins(sycl_device, tensor_range); } -namespace std { -template T cwiseMax(T x, T y) { return std::max(x, y); } -template T cwiseMin(T x, T y) { return std::min(x, y); } +template +void test_binary_builtins_func(const Eigen::SyclDevice& sycl_device, + const array& tensor_range) { + /* out = op(in_1, in_2) */ + Operator op; + Tensor in_1(tensor_range); + Tensor in_2(tensor_range); + Tensor out(tensor_range); + in_1 = in_1.random() + DataType(0.01); + in_2 = in_2.random() + DataType(0.01); + Tensor reference(out); + DataType *gpu_data_1 = static_cast( + sycl_device.allocate(in_1.size() * sizeof(DataType))); + DataType *gpu_data_2 = static_cast( + sycl_device.allocate(in_2.size() * sizeof(DataType))); + DataType *gpu_data_out = static_cast( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap> gpu_1(gpu_data_1, tensor_range); + TensorMap> gpu_2(gpu_data_2, tensor_range); + TensorMap> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), + (in_1.size()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), + (in_2.size()) * sizeof(DataType)); + gpu_out.device(sycl_device) = op(gpu_1, gpu_2); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + VERIFY_IS_APPROX(out(i), op(in_1(i), in_2(i))); + } + sycl_device.deallocate(gpu_data_1); + sycl_device.deallocate(gpu_data_2); + sycl_device.deallocate(gpu_data_out); } -#define TEST_BINARY_BUILTINS_FUNC(SCALAR, FUNC, Layout) \ - { \ - /* out = in_1.FUNC(in_2) */ \ - Tensor in_1(tensorRange); \ - Tensor in_2(tensorRange); \ - Tensor out(tensorRange); \ - in_1 = in_1.random() + static_cast(0.01); \ - in_2 = in_2.random() + static_cast(0.01); \ - Tensor reference(out); \ - SCALAR *gpu_data_1 = static_cast( \ - sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_2 = static_cast( \ - sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_out = static_cast( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap> gpu_1(gpu_data_1, tensorRange); \ - TensorMap> gpu_2(gpu_data_2, tensorRange); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ - (in_1.size()) * sizeof(SCALAR)); \ - sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \ - (in_2.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) = gpu_1.FUNC(gpu_2); \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - SCALAR ver = reference(i); \ - ver = std::FUNC(in_1(i), in_2(i)); \ - VERIFY_IS_APPROX(out(i), ver); \ - } \ - sycl_device.deallocate(gpu_data_1); \ - sycl_device.deallocate(gpu_data_2); \ - 
sycl_device.deallocate(gpu_data_out); \ +template +void test_binary_builtins_fixed_arg2(const Eigen::SyclDevice& sycl_device, + const array& tensor_range) { + /* out = op(in_1, 2) */ + Operator op; + const DataType arg2(2); + Tensor in_1(tensor_range); + Tensor out(tensor_range); + in_1 = in_1.random(); + Tensor reference(out); + DataType *gpu_data_1 = static_cast( + sycl_device.allocate(in_1.size() * sizeof(DataType))); + DataType *gpu_data_out = static_cast( + sycl_device.allocate(out.size() * sizeof(DataType))); + TensorMap> gpu_1(gpu_data_1, tensor_range); + TensorMap> gpu_out(gpu_data_out, tensor_range); + sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), + (in_1.size()) * sizeof(DataType)); + gpu_out.device(sycl_device) = op(gpu_1, arg2); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, + (out.size()) * sizeof(DataType)); + for (int64_t i = 0; i < out.size(); ++i) { + VERIFY_IS_APPROX(out(i), op(in_1(i), arg2)); } + sycl_device.deallocate(gpu_data_1); + sycl_device.deallocate(gpu_data_out); +} -#define TEST_BINARY_BUILTINS_OPERATORS(SCALAR, OPERATOR, Layout) \ - { \ - /* out = in_1 OPERATOR in_2 */ \ - Tensor in_1(tensorRange); \ - Tensor in_2(tensorRange); \ - Tensor out(tensorRange); \ - in_1 = in_1.random() + static_cast(0.01); \ - in_2 = in_2.random() + static_cast(0.01); \ - Tensor reference(out); \ - SCALAR *gpu_data_1 = static_cast( \ - sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_2 = static_cast( \ - sycl_device.allocate(in_2.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_out = static_cast( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap> gpu_1(gpu_data_1, tensorRange); \ - TensorMap> gpu_2(gpu_data_2, tensorRange); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ - (in_1.size()) * sizeof(SCALAR)); \ - sycl_device.memcpyHostToDevice(gpu_data_2, in_2.data(), \ - (in_2.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) = gpu_1 OPERATOR gpu_2; \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - VERIFY_IS_APPROX(out(i), in_1(i) OPERATOR in_2(i)); \ - } \ - sycl_device.deallocate(gpu_data_1); \ - sycl_device.deallocate(gpu_data_2); \ - sycl_device.deallocate(gpu_data_out); \ - } +#define DECLARE_BINARY_STRUCT(FUNC) \ + struct op_##FUNC { \ + template \ + auto operator()(const T1& x, const T2& y) -> decltype(cl::sycl::FUNC(x, y)) { \ + return cl::sycl::FUNC(x, y); \ + } \ + template \ + auto operator()(const TensorMap& x, const TensorMap& y) -> decltype(x.FUNC(y)) { \ + return x.FUNC(y); \ + } \ + }; -#define TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(SCALAR, OPERATOR, Layout) \ - { \ - /* out = in_1 OPERATOR 2 */ \ - Tensor in_1(tensorRange); \ - Tensor out(tensorRange); \ - in_1 = in_1.random() + static_cast(0.01); \ - Tensor reference(out); \ - SCALAR *gpu_data_1 = static_cast( \ - sycl_device.allocate(in_1.size() * sizeof(SCALAR))); \ - SCALAR *gpu_data_out = static_cast( \ - sycl_device.allocate(out.size() * sizeof(SCALAR))); \ - TensorMap> gpu_1(gpu_data_1, tensorRange); \ - TensorMap> gpu_out(gpu_data_out, tensorRange); \ - sycl_device.memcpyHostToDevice(gpu_data_1, in_1.data(), \ - (in_1.size()) * sizeof(SCALAR)); \ - gpu_out.device(sycl_device) = gpu_1 OPERATOR 2; \ - sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, \ - (out.size()) * sizeof(SCALAR)); \ - for (int64_t i = 0; i < out.size(); ++i) { \ - VERIFY_IS_APPROX(out(i), 
in_1(i) OPERATOR 2); \ - } \ - sycl_device.deallocate(gpu_data_1); \ - sycl_device.deallocate(gpu_data_out); \ - } +DECLARE_BINARY_STRUCT(cwiseMax) +DECLARE_BINARY_STRUCT(cwiseMin) -#define TEST_BINARY_BUILTINS(SCALAR, Layout) \ - TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMax , Layout) \ - TEST_BINARY_BUILTINS_FUNC(SCALAR, cwiseMin , Layout) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, + , Layout) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, - , Layout) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, * , Layout) \ - TEST_BINARY_BUILTINS_OPERATORS(SCALAR, / , Layout) +#define DECLARE_BINARY_STRUCT_OP(NAME, OPERATOR) \ + struct op_##NAME { \ + template \ + auto operator()(const T1& x, const T2& y) -> decltype(x OPERATOR y) { \ + return x OPERATOR y; \ + } \ + }; + +DECLARE_BINARY_STRUCT_OP(plus, +) +DECLARE_BINARY_STRUCT_OP(minus, -) +DECLARE_BINARY_STRUCT_OP(times, *) +DECLARE_BINARY_STRUCT_OP(divide, /) +DECLARE_BINARY_STRUCT_OP(modulo, %) + +template +void test_binary_builtins(const Eigen::SyclDevice& sycl_device, + const array& tensor_range) { + test_binary_builtins_func(sycl_device, tensor_range); + test_binary_builtins_func(sycl_device, tensor_range); + test_binary_builtins_func(sycl_device, tensor_range); + test_binary_builtins_func(sycl_device, tensor_range); + test_binary_builtins_func(sycl_device, tensor_range); + test_binary_builtins_func(sycl_device, tensor_range); +} + +template +static void test_floating_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { + int64_t sizeDim1 = 10; + int64_t sizeDim2 = 10; + int64_t sizeDim3 = 10; + array tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; + test_binary_builtins(sycl_device, tensor_range); + test_binary_builtins(sycl_device, tensor_range); +} -static void test_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { +template +static void test_integer_builtin_binary_sycl(const Eigen::SyclDevice &sycl_device) { int64_t sizeDim1 = 10; int64_t sizeDim2 = 10; int64_t sizeDim3 = 10; - array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; - TEST_BINARY_BUILTINS(float, RowMajor) - TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, RowMajor) - TEST_BINARY_BUILTINS(float, ColMajor) - TEST_BINARY_BUILTINS_OPERATORS_THAT_TAKES_SCALAR(int, %, ColMajor) + array tensor_range = {{sizeDim1, sizeDim2, sizeDim3}}; + test_binary_builtins_fixed_arg2(sycl_device, tensor_range); + test_binary_builtins_fixed_arg2(sycl_device, tensor_range); } EIGEN_DECLARE_TEST(cxx11_tensor_builtins_sycl) { for (const auto& device :Eigen::get_sycl_supported_devices()) { QueueInterface queueInterface(device); Eigen::SyclDevice sycl_device(&queueInterface); - CALL_SUBTEST(test_builtin_unary_sycl(sycl_device)); - CALL_SUBTEST(test_builtin_binary_sycl(sycl_device)); + CALL_SUBTEST_1(test_builtin_unary_sycl(sycl_device)); + CALL_SUBTEST_2(test_floating_builtin_binary_sycl(sycl_device)); + CALL_SUBTEST_3(test_integer_builtin_binary_sycl(sycl_device)); } } diff --git a/unsupported/test/cxx11_tensor_chipping_sycl.cpp b/unsupported/test/cxx11_tensor_chipping_sycl.cpp index a91efe00c..1e7093104 100644 --- a/unsupported/test/cxx11_tensor_chipping_sycl.cpp +++ b/unsupported/test/cxx11_tensor_chipping_sycl.cpp @@ -419,6 +419,7 @@ static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device) const size_t tensorBuffSize =tensor.size()*sizeof(DataType); const size_t input2TensorBuffSize =input2.size()*sizeof(DataType); + std::cout << tensorBuffSize << " , "<< input2TensorBuffSize << std::endl; DataType* gpu_data_tensor = 
static_cast(sycl_device.allocate(tensorBuffSize)); DataType* gpu_data_input1 = static_cast(sycl_device.allocate(tensorBuffSize)); DataType* gpu_data_input2 = static_cast(sycl_device.allocate(input2TensorBuffSize)); @@ -605,14 +606,14 @@ static void test_chip_as_lvalue_sycl(const Eigen::SyclDevice& sycl_device) template void sycl_chipping_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_static_chip_sycl(sycl_device); + /* test_static_chip_sycl(sycl_device); test_static_chip_sycl(sycl_device); test_dynamic_chip_sycl(sycl_device); test_dynamic_chip_sycl(sycl_device); test_chip_in_expr(sycl_device); - test_chip_in_expr(sycl_device); + test_chip_in_expr(sycl_device);*/ test_chip_as_lvalue_sycl(sycl_device); - test_chip_as_lvalue_sycl(sycl_device); + // test_chip_as_lvalue_sycl(sycl_device); } EIGEN_DECLARE_TEST(cxx11_tensor_chipping_sycl) { diff --git a/unsupported/test/cxx11_tensor_contract_sycl.cpp b/unsupported/test/cxx11_tensor_contract_sycl.cpp index c8e86e69f..fbcc29358 100644 --- a/unsupported/test/cxx11_tensor_contract_sycl.cpp +++ b/unsupported/test/cxx11_tensor_contract_sycl.cpp @@ -17,23 +17,27 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL -#include +#include #include #include +#include #include "main.h" + #include using Eigen::array; using Eigen::SyclDevice; using Eigen::Tensor; using Eigen::TensorMap; -template -void static test_sycl_contraction(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size) -{ - typedef typename Tensor::DimensionPair DimPair; - static const DataType error_threshold =1e-4f; -// std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; + +template +void static test_sycl_contraction(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU @@ -41,7 +45,6 @@ void static test_sycl_contraction(const Device& sycl_device, IndexType m_size, I Tensor t_right(k_size, n_size); Tensor t_result(m_size, n_size); Tensor t_result_gpu(m_size, n_size); -// Eigen::array dims(DimPair(1, 0)); Eigen::array dims = {{DimPair(1, 0)}}; Eigen::array left_dims = {{m_size, k_size}}; Eigen::array right_dims = {{k_size, n_size}}; @@ -50,117 +53,217 @@ void static test_sycl_contraction(const Device& sycl_device, IndexType m_size, I t_left.setRandom(); t_right.setRandom(); - std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); std::size_t t_right_bytes = t_right.size() * sizeof(DataType); std::size_t t_result_bytes = t_result.size() * sizeof(DataType); - DataType * d_t_left = static_cast(sycl_device.allocate(t_left_bytes)); - DataType * d_t_right = static_cast(sycl_device.allocate(t_right_bytes)); - DataType * d_t_result = static_cast(sycl_device.allocate(t_result_bytes)); + DataType *d_t_left = + static_cast(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast(sycl_device.allocate(t_result_bytes)); - Eigen::TensorMap > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap > gpu_t_result(d_t_result, result_dims); 
+ Eigen::TensorMap> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap> + gpu_t_result(d_t_result, result_dims); - sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); - sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); - sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); t_result = t_left.contract(t_right, dims); for (IndexType i = 0; i < t_result.size(); i++) { - if (static_cast(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { + if (static_cast(std::fabs(static_cast( + t_result(i) - t_result_gpu(i)))) < error_threshold) { continue; } - if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), error_threshold)) { + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { continue; } - std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) - << " vs " << t_result_gpu(i) << std::endl; - assert(false); + + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); } sycl_device.deallocate(d_t_left); sycl_device.deallocate(d_t_right); sycl_device.deallocate(d_t_result); } -template -void test_TF(const Device& sycl_device) -{ - typedef typename Tensor::DimensionPair DimPair; - static const DataType error_threshold =1e-4f; - Eigen::array left_dims = {{2, 3}}; - Eigen::array right_dims = {{3, 1}}; - Eigen::array res_dims = {{2, 1}}; - Eigen::array dims = {{DimPair(1, 0)}}; +template +void test_sycl_contraction_m(const Device &sycl_device) { + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction(sycl_device, k, 128, + 128); + } +} +template +void test_sycl_contraction_k(const Device &sycl_device) { + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction(sycl_device, 128, k, + 128); + } +} - Tensor t_left(left_dims); - Tensor t_right(right_dims); - Tensor t_result_gpu(res_dims); - Tensor t_result(res_dims); +template +void test_sycl_contraction_n(const Device &sycl_device) { + for (IndexType k = 32; k < 256; k++) { + test_sycl_contraction(sycl_device, 128, + 128, k); + } +} - t_left.data()[0] = 1.0f; - t_left.data()[1] = 2.0f; - t_left.data()[2] = 3.0f; - t_left.data()[3] = 4.0f; - t_left.data()[4] = 5.0f; - t_left.data()[5] = 6.0f; +template +void test_sycl_contraction_sizes(const Device &sycl_device) { + IndexType m_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255, + 257, 511, 512, 513, 1023, 1024, 1025}; - t_right.data()[0] = -1.0f; - t_right.data()[1] = 0.5f; - t_right.data()[2] = 2.0f; + IndexType n_sizes[] = {31, 39, 63, 64, 65, 127, 129, 255, + 257, 511, 512, 513, 1023, 1024, 1025}; - std::size_t t_left_bytes = t_left.size() * sizeof(DataType); - std::size_t t_right_bytes = t_right.size() * sizeof(DataType); - std::size_t t_result_bytes = t_result.size()*sizeof(DataType); + IndexType k_sizes[] = {31, 39, 63, 64, 65, 95, 96, 127, 129, + 255, 257, 511, 512, 513, 1023, 1024, 1025}; + + for (IndexType i = 0; i < 15; i++) { + for (IndexType j = 0; j < 15; j++) { + for (IndexType k = 0; k < 17; k++) { + 
test_sycl_contraction( + sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]); + } + } + } +} + +template +void static test_no_out_of_bounds(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Tensor t_left(m_size, k_size); + Tensor t_right(k_size, n_size); + Tensor t_result(m_size, n_size); + + Eigen::array dims = {{DimPair(1, 0)}}; + Eigen::array left_dims = {{m_size, k_size}}; + Eigen::array right_dims = {{k_size, n_size}}; + Eigen::array result_dims = {{m_size, n_size}}; + t_left.setRandom(); + t_right.setRandom(); - DataType * d_t_left = static_cast(sycl_device.allocate(t_left_bytes)); - DataType * d_t_right = static_cast(sycl_device.allocate(t_right_bytes)); - DataType * d_t_result = static_cast(sycl_device.allocate(t_result_bytes)); + // Allocate buffers twice as big to check for invalid read and write + auto padded_left_size = 2 * t_left.size(); + auto padded_right_size = 2 * t_right.size(); + auto padded_result_size = 2 * t_result.size(); - Eigen::TensorMap > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap > gpu_t_result(d_t_result, res_dims); + std::size_t t_left_bytes = padded_left_size * sizeof(DataType); + std::size_t t_right_bytes = padded_right_size * sizeof(DataType); + std::size_t t_result_bytes = padded_result_size * sizeof(DataType); - sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); - sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); + DataType *d_t_left = + static_cast(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast(sycl_device.allocate(t_result_bytes)); + + // TensorMaps are still of the same size than the Tensors + Eigen::TensorMap> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap> + gpu_t_result(d_t_result, result_dims); + + // Write nan after the actual buffer to propagate nans everywhere in case of + // invalid reads + DataType nan = std::numeric_limits::quiet_NaN(); + auto host_left_data = new DataType[padded_left_size]; + std::copy_n(t_left.data(), t_left.size(), host_left_data); + std::fill_n(host_left_data + t_left.size(), t_left.size(), nan); + auto host_right_data = new DataType[padded_right_size]; + std::copy_n(t_right.data(), t_right.size(), host_right_data); + std::fill_n(host_right_data + t_right.size(), t_right.size(), nan); + auto host_result_data = new DataType[padded_result_size]; + std::fill_n(host_result_data, padded_result_size, nan); + + sycl_device.memcpyHostToDevice(d_t_left, host_left_data, t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, host_right_data, t_right_bytes); + sycl_device.memcpyHostToDevice(d_t_result, host_result_data, t_result_bytes); gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); - sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + sycl_device.memcpyDeviceToHost(host_result_data, d_t_result, t_result_bytes); t_result = t_left.contract(t_right, dims); for (IndexType i = 0; i < t_result.size(); i++) { - if (static_cast(fabs(t_result(i) - t_result_gpu(i))) < error_threshold) { + if (static_cast(std::fabs(static_cast( + t_result(i) - host_result_data[i]))) < error_threshold) { continue; } - if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), 
error_threshold)) { + if (Eigen::internal::isApprox(t_result(i), host_result_data[i], + error_threshold)) { continue; } - std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) - << " vs " << t_result_gpu(i) << std::endl; - assert(false); + if (std::isnan(host_result_data[i])) { + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", invalid read detected at IndexType " << i << ": " + << t_result(i) << " vs " << host_result_data[i] << std::endl; + } else { + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " + << t_result(i) << " vs " << host_result_data[i] << std::endl; + } + VERIFY_IS_APPROX(host_result_data[i], t_result(i)); + } + // Make sure that the rest of the result is still nans + for (IndexType i = t_result.size(); i < padded_result_size; i++) { + if (std::isnan(host_result_data[i])) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", invalid write detected at IndexType " << i << ": " + << host_result_data[i] << std::endl; + VERIFY_IS_APPROX(host_result_data[i], t_result(i)); } sycl_device.deallocate(d_t_left); sycl_device.deallocate(d_t_right); sycl_device.deallocate(d_t_result); - + delete[] host_left_data; + delete[] host_right_data; + delete[] host_result_data; } -template -void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, IndexType n_size) -{ - //std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << ")" << std::endl; +template +void test_scalar(const Device &sycl_device, IndexType m_size, IndexType k_size, + IndexType n_size) { + // std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size << + // ")" << std::endl; // with these dimensions, the output has 300 * 140 elements, which is // more than 30 * 1024, which is the number of threads in blocks on // a 15 SM GK110 GPU - typedef typename Tensor::DimensionPair DimPair; - static const DataType error_threshold =1e-4f; + typedef typename Tensor::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); Tensor t_left(m_size, k_size); Tensor t_right(k_size, n_size); Tensor t_result; @@ -171,32 +274,40 @@ void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, t_left.setRandom(); t_right.setRandom(); - std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); std::size_t t_right_bytes = t_right.size() * sizeof(DataType); std::size_t t_result_bytes = sizeof(DataType); + DataType *d_t_left = + static_cast(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast(sycl_device.allocate(t_result_bytes)); - DataType * d_t_left = static_cast(sycl_device.allocate(t_left_bytes)); - DataType * d_t_right = static_cast(sycl_device.allocate(t_right_bytes)); - DataType * d_t_result = static_cast(sycl_device.allocate(t_result_bytes)); + Eigen::TensorMap> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap> + gpu_t_result(d_t_result); - Eigen::TensorMap > gpu_t_left(d_t_left, left_dims); - Eigen::TensorMap > gpu_t_right(d_t_right, right_dims); - Eigen::TensorMap > gpu_t_result(d_t_result); - - sycl_device.memcpyHostToDevice(d_t_left, t_left.data(),t_left_bytes); - sycl_device.memcpyHostToDevice(d_t_right, t_right.data(),t_right_bytes); + 
sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); - sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, t_result_bytes); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); t_result = t_left.contract(t_right, dims); - if (static_cast(fabs(t_result() - t_result_gpu())) > error_threshold && + if (static_cast(std::fabs(static_cast( + t_result() - t_result_gpu()))) > error_threshold && !Eigen::internal::isApprox(t_result(), t_result_gpu(), error_threshold)) { - std::cout << "mismatch detected: " << t_result() - << " vs " << t_result_gpu() << std::endl; - assert(false); + std::cout << "K: " << k_size << ", N: " << n_size << ", M: " << m_size + << " : mismatch detected: " << t_result() << " vs " + << t_result_gpu() << std::endl; + VERIFY_IS_APPROX(t_result_gpu(), t_result()); } sycl_device.deallocate(d_t_left); @@ -204,87 +315,712 @@ void test_scalar(const Device& sycl_device, IndexType m_size, IndexType k_size, sycl_device.deallocate(d_t_result); } +template +void contraction_batch(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size, IndexType m_batch, + IndexType start, IndexType limit) { + typedef typename Tensor::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + typedef Eigen::array TensorDim; + typedef Eigen::Tensor TensorType; + TensorDim left_dims = {{m_batch, k_size, m_size}}; + TensorDim right_dims = {{m_batch, n_size, k_size}}; + TensorDim res_dims = {{m_batch, m_size, n_size}}; + Eigen::array contract_pairs = {{DimPair(0, 1)}}; -template -void test_sycl_contraction_m(const Device& sycl_device) { - for (IndexType k = 32; k < 256; k++) { - test_sycl_contraction(sycl_device, k, 128, 128); + TensorType t_left(left_dims); + TensorType t_right(right_dims); + TensorType t_result_gpu(res_dims); + TensorType t_result(res_dims); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + for (int i = start; i < limit; ++i) { + auto x = gpu_t_left.template chip<0>(i); + auto y = gpu_t_right.template chip<0>(i); + auto z = gpu_t_result.template chip<0>(i); + z.device(sycl_device) = x.contract(y, contract_pairs); + } + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + for (int i = start; i < limit; ++i) { + auto x = t_left.template chip<0>(i); + auto y = t_right.template chip<0>(i); + auto z = t_result.template chip<0>(i); + z = x.contract(y, contract_pairs); + } + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast(std::fabs(static_cast( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if 
(Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + std::cout << "mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); } -template -void test_sycl_contraction_k(const Device& sycl_device) { - for (IndexType k = 32; k < 256; k++) { - test_sycl_contraction(sycl_device, 128, k, 128); +template +void contraction_rhs_transposed(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Eigen::array left_dims = {{m_size, k_size}}; + Eigen::array right_dims = {{n_size, k_size}}; + Eigen::array res_dims = {{m_size, n_size}}; + Eigen::array dims = {{DimPair(1, 1)}}; + + Tensor t_left(left_dims); + Tensor t_right(right_dims); + Tensor t_result_gpu(res_dims); + Tensor t_result(res_dims); + + t_left.setRandom(); + t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap> + gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType j = 0; j < m_size; j++) { + for (IndexType i = 0; i < n_size; i++) { + if (static_cast(std::fabs(static_cast( + t_result(j, i) - t_result_gpu(j, i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(j, i), t_result_gpu(j, i), + error_threshold)) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType m: " << j << " n: " << i + << " CPU : " << t_result(j, i) + << " vs SYCL:" << t_result_gpu(j, i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(j, i), t_result(j, i)); + } } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); } -template -void test_sycl_contraction_n(const Device& sycl_device) { - for (IndexType k = 32; k < 256; k++) { - test_sycl_contraction(sycl_device, 128, 128, k); +template +void contraction_lhs_transposed(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Eigen::array left_dims = {{k_size, m_size}}; + Eigen::array right_dims = {{k_size, n_size}}; + Eigen::array res_dims = {{m_size, n_size}}; + Eigen::array dims = {{DimPair(0, 0)}}; + + Tensor t_left(left_dims); + Tensor t_right(right_dims); + Tensor t_result_gpu(res_dims); + Tensor t_result(res_dims); + + t_left.setRandom(); + 
t_right.setRandom(); + + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType *d_t_left = + static_cast(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap> + gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast(std::fabs(static_cast( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); } +template +void contraction_both_transposed(const Device &sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size) { + typedef typename Tensor::DimensionPair + DimPair; + static const DataType error_threshold = DataType(1e-4); + Eigen::array left_dims = {{k_size, m_size}}; + Eigen::array right_dims = {{n_size, k_size}}; + Eigen::array res_dims = {{m_size, n_size}}; + Eigen::array dims = {{DimPair(0, 1)}}; -template -void test_sycl_contraction_sizes(const Device& sycl_device) { - IndexType m_sizes[] = { 31, 39, 63, 64, 65, - 127, 129, 255, 257 , 511, - 512, 513, 1023, 1024, 1025}; + Tensor t_left(left_dims); + Tensor t_right(right_dims); + Tensor t_result_gpu(res_dims); + Tensor t_result(res_dims); - IndexType n_sizes[] = { 31, 39, 63, 64, 65, - 127, 129, 255, 257, 511, - 512, 513, 1023, 1024, 1025}; + t_left.setRandom(); + t_right.setRandom(); - IndexType k_sizes[] = { 31, 39, 63, 64, 65, - 95, 96, 127, 129, 255, - 257, 511, 512, 513, 1023, - 1024, 1025}; + std::size_t t_left_bytes = t_left.size() * sizeof(DataType); + std::size_t t_right_bytes = t_right.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); - for (IndexType i = 0; i < 15; i++) { - for (IndexType j = 0; j < 15; j++) { - for (IndexType k = 0; k < 17; k++) { - test_sycl_contraction(sycl_device, m_sizes[i], n_sizes[j], k_sizes[k]); - } + DataType *d_t_left = + static_cast(sycl_device.allocate(t_left_bytes)); + DataType *d_t_right = + static_cast(sycl_device.allocate(t_right_bytes)); + DataType *d_t_result = + static_cast(sycl_device.allocate(t_result_bytes)); + + Eigen::TensorMap> + gpu_t_left(d_t_left, left_dims); + Eigen::TensorMap> + gpu_t_right(d_t_right, right_dims); + Eigen::TensorMap> + gpu_t_result(d_t_result, res_dims); + + sycl_device.memcpyHostToDevice(d_t_left, t_left.data(), t_left_bytes); + sycl_device.memcpyHostToDevice(d_t_right, t_right.data(), t_right_bytes); + + 
gpu_t_result.device(sycl_device) = gpu_t_left.contract(gpu_t_right, dims); + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), d_t_result, + t_result_bytes); + + t_result = t_left.contract(t_right, dims); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast(std::fabs(static_cast( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if (Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; } + std::cout << "M : " << m_size << ", N : " << n_size << ", K : " << k_size + << ", mismatch detected at IndexType " << i << ": " << t_result(i) + << " vs " << t_result_gpu(i) << std::endl; + + VERIFY_IS_APPROX(t_result_gpu(i), t_result(i)); } + sycl_device.deallocate(d_t_left); + sycl_device.deallocate(d_t_right); + sycl_device.deallocate(d_t_result); +} + +template +void inline tensorOutofBound(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // Test out of bound for Tensor-Tensor + test_no_out_of_bounds(sycl_device, 10, 1024, + 1024); + test_no_out_of_bounds(sycl_device, 1024, 1024, + 4096); + test_no_out_of_bounds(sycl_device, 4096, 1024, + 2048); + test_no_out_of_bounds(sycl_device, 784, 2048, + 1024); + test_no_out_of_bounds(sycl_device, 2048, 1024, + 784); + test_no_out_of_bounds(sycl_device, 10, 1024, + 10); + test_no_out_of_bounds(sycl_device, 513, 4096, + 513); + test_no_out_of_bounds(sycl_device, 783, 1024, + 783); + test_no_out_of_bounds(sycl_device, 784, 2048, + 784); + test_no_out_of_bounds(sycl_device, 11, 1024, + 11); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor out of bound tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline tensorTensor(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction(sycl_device, 128, 128, + 128); + test_sycl_contraction(sycl_device, 128, 128, + 128); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline tensorTensor_m(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction_m(sycl_device); + test_sycl_contraction_m(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline tensorTensor_n(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction_n(sycl_device); + 
test_sycl_contraction_n(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; } -template void tensorContractionPerDevice(Dev_selector& s){ - QueueInterface queueInterface(s); - auto sycl_device=Eigen::SyclDevice(&queueInterface); - test_sycl_contraction(sycl_device, 32, 32, 32); - test_sycl_contraction(sycl_device, 32, 32, 32); - test_scalar(sycl_device, 32, 32, 32); - test_scalar(sycl_device, 32, 32, 32); +template +void inline tensorTensor_k(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; std::chrono::time_point start, end; start = std::chrono::system_clock::now(); - test_sycl_contraction(sycl_device, 128, 128, 128); - test_sycl_contraction(sycl_device, 128, 128, 128); - test_scalar(sycl_device, 128, 128, 128); - test_scalar(sycl_device, 128, 128, 128); - test_sycl_contraction_m(sycl_device); - test_sycl_contraction_m(sycl_device); - test_sycl_contraction_n(sycl_device); - test_sycl_contraction_n(sycl_device); - test_sycl_contraction_k(sycl_device); - test_sycl_contraction_k(sycl_device); - test_sycl_contraction_sizes(sycl_device); - test_sycl_contraction_sizes(sycl_device); - test_TF(sycl_device); - test_TF(sycl_device); + test_sycl_contraction_k(sycl_device); + test_sycl_contraction_k(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline tensorTensor_sizes(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction_sizes(sycl_device); + test_sycl_contraction_sizes(sycl_device); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "tensor tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} +template +void inline vectorVector(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // VECTOR-VECTOR + test_sycl_contraction(sycl_device, 1025, 1, + 1025); + test_sycl_contraction(sycl_device, 1025, 1, + 1025); + test_sycl_contraction(sycl_device, 1024, 1, + 1024); + test_sycl_contraction(sycl_device, 1024, 1, + 1024); + test_sycl_contraction(sycl_device, 1023, 1, + 1023); + test_sycl_contraction(sycl_device, 1023, 1, + 1023); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "contracted tensor tests finished computation at " + << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline vectorTensor(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = 
std::chrono::system_clock::now();
+  // Vector-Tensor
+  test_sycl_contraction(sycl_device, 1, 1025,
+                        1025);
+  test_sycl_contraction(sycl_device, 1, 1025,
+                        1025);
+  test_sycl_contraction(sycl_device, 1, 1024,
+                        1024);
+  test_sycl_contraction(sycl_device, 1, 1024,
+                        1024);
+  test_sycl_contraction(sycl_device, 1, 1023,
+                        1023);
+  test_sycl_contraction(sycl_device, 1, 1023,
+                        1023);
+
+  test_sycl_contraction(sycl_device, 1, 4097,
+                        4097);
+  test_sycl_contraction(sycl_device, 1, 4097,
+                        4097);
+  test_sycl_contraction(sycl_device, 1, 4096,
+                        4096);
+  test_sycl_contraction(sycl_device, 1, 4096,
+                        4096);
+  test_sycl_contraction(sycl_device, 1, 4095,
+                        4095);
+  test_sycl_contraction(sycl_device, 1, 4095,
+                        4095);
+  test_sycl_contraction(sycl_device, 1, 802816,
+                        32);
   end = std::chrono::system_clock::now();
-  std::chrono::duration elapsed_seconds = end-start;
+  std::chrono::duration elapsed_seconds = end - start;
   std::time_t end_time = std::chrono::system_clock::to_time_t(end);
   std::cout << "finished computation at " << std::ctime(&end_time)
             << "elapsed time: " << elapsed_seconds.count() << "s\n";
+}
+
+template
+void inline tensorVector(const Dev &sycl_device) {
+  typedef float DataType;
+  typedef int64_t IndexType;
+  std::chrono::time_point start, end;
+  start = std::chrono::system_clock::now();
+  // Matrix-Vector
+  test_sycl_contraction(sycl_device, 1025, 1025,
+                        1);
+  test_sycl_contraction(sycl_device, 1125, 1025,
+                        1);
+  test_sycl_contraction(sycl_device, 1224, 1024,
+                        1);
+  test_sycl_contraction(sycl_device, 1024, 1024,
+                        1);
+  test_sycl_contraction(sycl_device, 1023, 1023,
+                        1);
+  test_sycl_contraction(sycl_device, 1023, 1023,
+                        1);
+  test_sycl_contraction(sycl_device, 4097, 4197,
+                        1);
+  test_sycl_contraction(sycl_device, 4097, 4097,
+                        1);
+  test_sycl_contraction(sycl_device, 4096, 4096,
+                        1);
+  test_sycl_contraction(sycl_device, 4096, 8196,
+                        1);
+  test_sycl_contraction(sycl_device, 4095, 4095,
+                        1);
+  test_sycl_contraction(sycl_device, 4095, 4095,
+                        1);
+// If GEMV is disabled, a single kernel computes the whole contraction, so the
+// accumulated floating-point error of the float sum exceeds the precision
+// threshold and the test fails. With GEMV enabled, the contraction is split
+// across multiple kernels, each accumulating only part of the sum, which
+// keeps the error within the threshold.
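+// Rough estimate (editorial, not measured from the kernel): each output
+// element of the guarded (32, 802032, 1) case accumulates about 8e5 float
+// products in one kernel, and the rounding error of such a long
+// single-precision sum can easily exceed the 1e-4 tolerance used by
+// test_sycl_contraction, whereas the GEMV path only accumulates much shorter
+// per-kernel partial sums.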
+#ifndef EIGEN_SYCL_DISABLE_GEMV + test_sycl_contraction(sycl_device, 32, 802032, + 1); +#endif + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline tensorScalar(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // SCALAR Contraction + test_scalar(sycl_device, 127, 127, 127); + test_scalar(sycl_device, 127, 127, 127); + test_scalar(sycl_device, 128, 128, 128); + test_scalar(sycl_device, 128, 128, 128); + test_scalar(sycl_device, 129, 129, 129); + test_scalar(sycl_device, 129, 129, 129); + + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline skinnyTensor_row(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction(sycl_device, 16, 4, 16); + test_sycl_contraction(sycl_device, 257, 131073, + 257); + test_sycl_contraction(sycl_device, 256, 131072, + 256); + test_sycl_contraction(sycl_device, 16, 131073, + 16); + test_sycl_contraction(sycl_device, 17, 131072, + 17); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline skinnyTensor_col(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + // Tensor Tensor Contraction + test_sycl_contraction(sycl_device, 16, 4, 16); + test_sycl_contraction(sycl_device, 257, 131073, + 257); + test_sycl_contraction(sycl_device, 256, 131072, + 256); + test_sycl_contraction(sycl_device, 16, 131073, + 16); + test_sycl_contraction(sycl_device, 17, 131072, + 17); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} +template +void inline tensor_contraction_batch_per_device(const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + contraction_batch(sycl_device, 64, 75, 30, 4, + 0, 4); + contraction_batch(sycl_device, 64, 75, 30, 4, + 0, 4); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline tensor_contraction_lhs_transposed_per_device( + const Dev &sycl_device) { + typedef float DataType; + typedef 
int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + contraction_lhs_transposed(sycl_device, 8, 4, + 8); + contraction_lhs_transposed(sycl_device, 32, 8, + 32); + contraction_lhs_transposed(sycl_device, 64, 16, + 64); + contraction_lhs_transposed(sycl_device, 784, + 2048, 1024); + contraction_lhs_transposed(sycl_device, 1024, + 10, 1024); + contraction_lhs_transposed(sycl_device, 4096, + 1024, 1024); + contraction_lhs_transposed(sycl_device, 2048, + 4096, 1024); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline tensor_contraction_rhs_transposed_per_device( + const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + contraction_rhs_transposed(sycl_device, 16, 4, + 16); + contraction_rhs_transposed(sycl_device, 17, 5, + 17); + contraction_rhs_transposed(sycl_device, 32, 8, + 32); + contraction_rhs_transposed(sycl_device, 64, 16, + 64); + contraction_rhs_transposed(sycl_device, 10, + 1024, 1024); + contraction_rhs_transposed(sycl_device, 1024, + 1024, 4096); + contraction_rhs_transposed(sycl_device, 4096, + 1024, 2048); + contraction_rhs_transposed(sycl_device, 2048, + 1024, 784); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; +} + +template +void inline tensor_contraction_both_transposed_per_device( + const Dev &sycl_device) { + typedef float DataType; + typedef int64_t IndexType; + std::chrono::time_point start, end; + start = std::chrono::system_clock::now(); + + contraction_both_transposed(sycl_device, 17, 5, + 17); + contraction_both_transposed(sycl_device, 32, 8, + 32); + contraction_both_transposed(sycl_device, 64, + 16, 64); + end = std::chrono::system_clock::now(); + std::chrono::duration elapsed_seconds = end - start; + std::time_t end_time = std::chrono::system_clock::to_time_t(end); + std::cout << "finished computation at " << std::ctime(&end_time) + << "elapsed time: " << elapsed_seconds.count() << "s\n"; } EIGEN_DECLARE_TEST(cxx11_tensor_contract_sycl) { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(tensorContractionPerDevice(device)); + for (const auto &device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.template get_info() + << std::endl; + QueueInterface queueInterface(device); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + CALL_SUBTEST_1(tensorOutofBound(sycl_device)); + CALL_SUBTEST_2(tensorTensor(sycl_device)); + CALL_SUBTEST_2(tensorTensor_m(sycl_device)); + CALL_SUBTEST_2(tensorTensor_n(sycl_device)); + CALL_SUBTEST_2(tensorTensor_k(sycl_device)); + CALL_SUBTEST_2(tensorTensor_sizes(sycl_device)); + CALL_SUBTEST_3(vectorVector(sycl_device)); + CALL_SUBTEST_4(vectorTensor(sycl_device)); + CALL_SUBTEST_5(tensorVector(sycl_device)); + CALL_SUBTEST_6(tensorScalar(sycl_device)); + CALL_SUBTEST_7(skinnyTensor_row(sycl_device)); + CALL_SUBTEST_7(skinnyTensor_col(sycl_device)); + 
CALL_SUBTEST_8(tensor_contraction_batch_per_device(sycl_device)); + CALL_SUBTEST_9(tensor_contraction_lhs_transposed_per_device(sycl_device)); + CALL_SUBTEST_10(tensor_contraction_rhs_transposed_per_device(sycl_device)); + CALL_SUBTEST_11(tensor_contraction_both_transposed_per_device(sycl_device)); } } diff --git a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp index cc3b02448..d947ead83 100644 --- a/unsupported/test/cxx11_tensor_custom_op_sycl.cpp +++ b/unsupported/test/cxx11_tensor_custom_op_sycl.cpp @@ -80,6 +80,8 @@ static void test_custom_unary_op_sycl(const Eigen::SyclDevice &sycl_device) VERIFY_IS_EQUAL(out(i, j), 0); } } + sycl_device.deallocate(gpu_in1_data); +sycl_device.deallocate(gpu_out_data); } template @@ -147,6 +149,9 @@ static void test_custom_binary_op_sycl(const Eigen::SyclDevice &sycl_device) } } } + sycl_device.deallocate(gpu_in1_data); + sycl_device.deallocate(gpu_in2_data); + sycl_device.deallocate(gpu_out_data); } template void custom_op_perDevice(Dev_selector s){ diff --git a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp index 74d38a644..a55a5ad8a 100644 --- a/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp +++ b/unsupported/test/cxx11_tensor_forced_eval_sycl.cpp @@ -36,8 +36,8 @@ void test_forced_eval_sycl(const Eigen::SyclDevice &sycl_device) { DataType * gpu_in2_data = static_cast(sycl_device.allocate(in2.dimensions().TotalSize()*sizeof(DataType))); DataType * gpu_out_data = static_cast(sycl_device.allocate(out.dimensions().TotalSize()*sizeof(DataType))); - in1 = in1.random() + in1.constant(10.0f); - in2 = in2.random() + in2.constant(10.0f); + in1 = in1.random() + in1.constant(static_cast(10.0f)); + in2 = in2.random() + in2.constant(static_cast(10.0f)); // creating TensorMap from tensor Eigen::TensorMap> gpu_in1(gpu_in1_data, tensorRange); @@ -72,5 +72,6 @@ template void tensorForced_evalperDev EIGEN_DECLARE_TEST(cxx11_tensor_forced_eval_sycl) { for (const auto& device :Eigen::get_sycl_supported_devices()) { CALL_SUBTEST(tensorForced_evalperDevice(device)); + CALL_SUBTEST(tensorForced_evalperDevice(device)); } } diff --git a/unsupported/test/cxx11_tensor_image_op_sycl.cpp b/unsupported/test/cxx11_tensor_image_op_sycl.cpp new file mode 100644 index 000000000..db1c0206e --- /dev/null +++ b/unsupported/test/cxx11_tensor_image_op_sycl.cpp @@ -0,0 +1,103 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
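+//
+// Exercises slice() and stridedSlice() as the right-hand side of a device
+// expression: two 3-D tensors are sliced and subtracted on the SYCL device
+// and the result is compared element-wise against the host computation.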
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +using Eigen::Tensor; +using Eigen::RowMajor; +template +static void test_image_op_sycl(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 245; + IndexType sizeDim2 = 343; + IndexType sizeDim3 = 577; + + array input_range ={{sizeDim1, sizeDim2, sizeDim3}}; + array slice_range ={{sizeDim1-1, sizeDim2, sizeDim3}}; + + Tensor tensor1(input_range); + Tensor tensor2(input_range); + Tensor tensor3(slice_range); + Tensor tensor3_cpu(slice_range); + + + + typedef Eigen::DSizes Index3; + Index3 strides1(1L,1L, 1L); + Index3 indicesStart1(1L, 0L, 0L); + Index3 indicesStop1(sizeDim1, sizeDim2, sizeDim3); + + Index3 strides2(1L,1L, 1L); + Index3 indicesStart2(0L, 0L, 0L); + Index3 indicesStop2(sizeDim1-1, sizeDim2, sizeDim3); + Eigen::DSizes sizes(sizeDim1-1,sizeDim2,sizeDim3); + + tensor1.setRandom(); + tensor2.setRandom(); + + + DataType* gpu_data1 = static_cast(sycl_device.allocate(tensor1.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast(sycl_device.allocate(tensor2.size()*sizeof(DataType))); + DataType* gpu_data3 = static_cast(sycl_device.allocate(tensor3.size()*sizeof(DataType))); + + TensorMap> gpu1(gpu_data1, input_range); + TensorMap> gpu2(gpu_data2, input_range); + TensorMap> gpu3(gpu_data3, slice_range); + + sycl_device.memcpyHostToDevice(gpu_data1, tensor1.data(),(tensor1.size())*sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data2, tensor2.data(),(tensor2.size())*sizeof(DataType)); + gpu3.device(sycl_device)= gpu1.slice(indicesStart1, sizes) - gpu2.slice(indicesStart2, sizes); + sycl_device.memcpyDeviceToHost(tensor3.data(), gpu_data3,(tensor3.size())*sizeof(DataType)); + + tensor3_cpu = tensor1.stridedSlice(indicesStart1,indicesStop1,strides1) - tensor2.stridedSlice(indicesStart2,indicesStop2,strides2); + + + for (IndexType i = 0; i void sycl_computing_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_image_op_sycl(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_image_op_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_computing_test_per_device(device)); +#ifdef EIGEN_SYCL_DOUBLE_SUPPORT + CALL_SUBTEST(sycl_computing_test_per_device(device)); +#endif + } +} diff --git a/unsupported/test/cxx11_tensor_math_sycl.cpp b/unsupported/test/cxx11_tensor_math_sycl.cpp new file mode 100644 index 000000000..029653e27 --- /dev/null +++ b/unsupported/test/cxx11_tensor_math_sycl.cpp @@ -0,0 +1,105 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// Benoit Steiner +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
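+//
+// Covers element-wise math functions (tanh and sigmoid) on the SYCL device,
+// comparing the device output against the host evaluator.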
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::array; +using Eigen::SyclDevice; +using Eigen::Tensor; +using Eigen::TensorMap; + +using Eigen::Tensor; +using Eigen::RowMajor; +template +static void test_tanh_sycl(const Eigen::SyclDevice &sycl_device) +{ + + IndexType sizeDim1 = 4; + IndexType sizeDim2 = 4; + IndexType sizeDim3 = 1; + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor in(tensorRange); + Tensor out(tensorRange); + Tensor out_cpu(tensorRange); + + in = in.random(); + + DataType* gpu_data1 = static_cast(sycl_device.allocate(in.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast(sycl_device.allocate(out.size()*sizeof(DataType))); + + TensorMap> gpu1(gpu_data1, tensorRange); + TensorMap> gpu2(gpu_data2, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType)); + gpu2.device(sycl_device) = gpu1.tanh(); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType)); + + out_cpu=in.tanh(); + + for (int i = 0; i < in.size(); ++i) { + VERIFY_IS_APPROX(out(i), out_cpu(i)); + } +} +template +static void test_sigmoid_sycl(const Eigen::SyclDevice &sycl_device) +{ + + IndexType sizeDim1 = 4; + IndexType sizeDim2 = 4; + IndexType sizeDim3 = 1; + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; + Tensor in(tensorRange); + Tensor out(tensorRange); + Tensor out_cpu(tensorRange); + + in = in.random(); + + DataType* gpu_data1 = static_cast(sycl_device.allocate(in.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast(sycl_device.allocate(out.size()*sizeof(DataType))); + + TensorMap> gpu1(gpu_data1, tensorRange); + TensorMap> gpu2(gpu_data2, tensorRange); + + sycl_device.memcpyHostToDevice(gpu_data1, in.data(),(in.size())*sizeof(DataType)); + gpu2.device(sycl_device) = gpu1.sigmoid(); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data2,(out.size())*sizeof(DataType)); + + out_cpu=in.sigmoid(); + + for (int i = 0; i < in.size(); ++i) { + VERIFY_IS_APPROX(out(i), out_cpu(i)); + } +} + + +template void sycl_computing_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_tanh_sycl(sycl_device); + test_tanh_sycl(sycl_device); + test_sigmoid_sycl(sycl_device); + test_sigmoid_sycl(sycl_device); +} + +EIGEN_DECLARE_TEST(cxx11_tensor_math_sycl) { + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_computing_test_per_device(device)); + } +} diff --git a/unsupported/test/cxx11_tensor_morphing_sycl.cpp b/unsupported/test/cxx11_tensor_morphing_sycl.cpp index 93dabe3ec..bf001b40f 100644 --- a/unsupported/test/cxx11_tensor_morphing_sycl.cpp +++ b/unsupported/test/cxx11_tensor_morphing_sycl.cpp @@ -180,6 +180,82 @@ static void test_simple_slice(const Eigen::SyclDevice &sycl_device) sycl_device.deallocate(gpu_data3); } + +template +static void test_strided_slice_as_rhs_sycl(const Eigen::SyclDevice &sycl_device) +{ + IndexType sizeDim1 = 2; + IndexType sizeDim2 = 3; + IndexType sizeDim3 = 5; + IndexType sizeDim4 = 7; + IndexType sizeDim5 = 11; + typedef Eigen::DSizes Index5; + Index5 strides(1L,1L,1L,1L,1L); + Index5 indicesStart(1L,2L,3L,4L,5L); + Index5 indicesStop(2L,3L,4L,5L,6L); + Index5 lengths(1L,1L,1L,1L,1L); + + array tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4, sizeDim5}}; + Tensor tensor(tensorRange); + tensor.setRandom(); + + array 
slice1_range ={{1, 1, 1, 1, 1}}; + Tensor slice1(slice1_range); + Tensor slice_stride1(slice1_range); + + DataType* gpu_data1 = static_cast(sycl_device.allocate(tensor.size()*sizeof(DataType))); + DataType* gpu_data2 = static_cast(sycl_device.allocate(slice1.size()*sizeof(DataType))); + DataType* gpu_data_stride2 = static_cast(sycl_device.allocate(slice_stride1.size()*sizeof(DataType))); + + TensorMap> gpu1(gpu_data1, tensorRange); + TensorMap> gpu2(gpu_data2, slice1_range); + TensorMap> gpu_stride2(gpu_data_stride2, slice1_range); + + Eigen::DSizes indices(1,2,3,4,5); + Eigen::DSizes sizes(1,1,1,1,1); + sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(),(tensor.size())*sizeof(DataType)); + gpu2.device(sycl_device)=gpu1.slice(indices, sizes); + sycl_device.memcpyDeviceToHost(slice1.data(), gpu_data2,(slice1.size())*sizeof(DataType)); + + gpu_stride2.device(sycl_device)=gpu1.stridedSlice(indicesStart,indicesStop,strides); + sycl_device.memcpyDeviceToHost(slice_stride1.data(), gpu_data_stride2,(slice_stride1.size())*sizeof(DataType)); + + VERIFY_IS_EQUAL(slice1(0,0,0,0,0), tensor(1,2,3,4,5)); + VERIFY_IS_EQUAL(slice_stride1(0,0,0,0,0), tensor(1,2,3,4,5)); + + array slice2_range ={{1,1,2,2,3}}; + Tensor slice2(slice2_range); + Tensor strideSlice2(slice2_range); + + DataType* gpu_data3 = static_cast(sycl_device.allocate(slice2.size()*sizeof(DataType))); + DataType* gpu_data_stride3 = static_cast(sycl_device.allocate(strideSlice2.size()*sizeof(DataType))); + TensorMap> gpu3(gpu_data3, slice2_range); + TensorMap> gpu_stride3(gpu_data_stride3, slice2_range); + Eigen::DSizes indices2(1,1,3,4,5); + Eigen::DSizes sizes2(1,1,2,2,3); + Index5 strides2(1L,1L,1L,1L,1L); + Index5 indicesStart2(1L,1L,3L,4L,5L); + Index5 indicesStop2(2L,2L,5L,6L,8L); + + gpu3.device(sycl_device)=gpu1.slice(indices2, sizes2); + sycl_device.memcpyDeviceToHost(slice2.data(), gpu_data3,(slice2.size())*sizeof(DataType)); + + gpu_stride3.device(sycl_device)=gpu1.stridedSlice(indicesStart2,indicesStop2,strides2); + sycl_device.memcpyDeviceToHost(strideSlice2.data(), gpu_data_stride3,(strideSlice2.size())*sizeof(DataType)); + + for (IndexType i = 0; i < 2; ++i) { + for (IndexType j = 0; j < 2; ++j) { + for (IndexType k = 0; k < 3; ++k) { + VERIFY_IS_EQUAL(slice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + VERIFY_IS_EQUAL(strideSlice2(0,0,i,j,k), tensor(1,1,3+i,4+j,5+k)); + } + } + } + sycl_device.deallocate(gpu_data1); + sycl_device.deallocate(gpu_data2); + sycl_device.deallocate(gpu_data3); +} + template static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device) { @@ -228,6 +304,65 @@ static void test_strided_slice_write_sycl(const Eigen::SyclDevice& sycl_device) sycl_device.deallocate(gpu_data3); } +template +Eigen::array To32BitDims(const DSizes& in) { + Eigen::array out; + for (int i = 0; i < DSizes::count; ++i) { + out[i] = in[i]; + } + return out; +} + +template +int run_eigen(const SyclDevice& sycl_device) { + using TensorI64 = Tensor; + using TensorI32 = Tensor; + using TensorMI64 = TensorMap; + using TensorMI32 = TensorMap; + Eigen::array tensor_range{{4, 1, 1, 1, 6}}; + Eigen::array slice_range{{4, 1, 1, 1, 3}}; + + TensorI64 out_tensor_gpu(tensor_range); + TensorI64 out_tensor_cpu(tensor_range); + out_tensor_cpu.setRandom(); + + TensorI64 sub_tensor(slice_range); + sub_tensor.setRandom(); + + DataType* out_gpu_data = static_cast(sycl_device.allocate(out_tensor_cpu.size() * sizeof(DataType))); + DataType* sub_gpu_data = static_cast(sycl_device.allocate(sub_tensor.size() * sizeof(DataType))); + 
TensorMI64 out_gpu(out_gpu_data, tensor_range); + TensorMI64 sub_gpu(sub_gpu_data, slice_range); + + sycl_device.memcpyHostToDevice(out_gpu_data, out_tensor_cpu.data(), out_tensor_cpu.size() * sizeof(DataType)); + sycl_device.memcpyHostToDevice(sub_gpu_data, sub_tensor.data(), sub_tensor.size() * sizeof(DataType)); + + Eigen::array slice_offset_32{{0, 0, 0, 0, 3}}; + Eigen::array slice_range_32{{4, 1, 1, 1, 3}}; + TensorMI32 out_cpu_32(out_tensor_cpu.data(), To32BitDims(out_tensor_cpu.dimensions())); + TensorMI32 sub_cpu_32(sub_tensor.data(), To32BitDims(sub_tensor.dimensions())); + TensorMI32 out_gpu_32(out_gpu.data(), To32BitDims(out_gpu.dimensions())); + TensorMI32 sub_gpu_32(sub_gpu.data(), To32BitDims(sub_gpu.dimensions())); + + out_gpu_32.slice(slice_offset_32, slice_range_32).device(sycl_device) = sub_gpu_32; + + out_cpu_32.slice(slice_offset_32, slice_range_32) = sub_cpu_32; + + sycl_device.memcpyDeviceToHost(out_tensor_gpu.data(), out_gpu_data, out_tensor_cpu.size() * sizeof(DataType)); + int has_err = 0; + for (IndexType i = 0; i < out_tensor_cpu.size(); ++i) { + auto exp = out_tensor_cpu(i); + auto val = out_tensor_gpu(i); + if (val != exp) { + std::cout << "#" << i << " got " << val << " but expected " << exp << std::endl; + has_err = 1; + } + } + sycl_device.deallocate(out_gpu_data); + sycl_device.deallocate(sub_gpu_data); + return has_err; +} + template void sycl_morphing_test_per_device(dev_Selector s){ QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); @@ -239,6 +374,9 @@ template void sycl_morphing_test_per_d test_reshape_as_lvalue(sycl_device); test_strided_slice_write_sycl(sycl_device); test_strided_slice_write_sycl(sycl_device); + test_strided_slice_as_rhs_sycl(sycl_device); + test_strided_slice_as_rhs_sycl(sycl_device); + run_eigen(sycl_device); } EIGEN_DECLARE_TEST(cxx11_tensor_morphing_sycl) { diff --git a/unsupported/test/cxx11_tensor_random_sycl.cpp b/unsupported/test/cxx11_tensor_random_sycl.cpp new file mode 100644 index 000000000..6c83894a3 --- /dev/null +++ b/unsupported/test/cxx11_tensor_random_sycl.cpp @@ -0,0 +1,100 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
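+//
+// Covers the uniform and normal random number generators on the SYCL device.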
+ +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +template +static void test_sycl_random_uniform(const Eigen::SyclDevice& sycl_device) +{ + Tensor out(72,97); + out.setZero(); + + std::size_t out_bytes = out.size() * sizeof(DataType); + + IndexType sizeDim0 = 72; + IndexType sizeDim1 = 97; + + array tensorRange = {{sizeDim0, sizeDim1}}; + + DataType* d_out = static_cast(sycl_device.allocate(out_bytes)); + TensorMap> gpu_out(d_out, tensorRange); + + gpu_out.device(sycl_device)=gpu_out.random(); + sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes); + for(IndexType i=1; i +void test_sycl_random_normal(const Eigen::SyclDevice& sycl_device) +{ + Tensor out(72,97); + out.setZero(); + std::size_t out_bytes = out.size() * sizeof(DataType); + + IndexType sizeDim0 = 72; + IndexType sizeDim1 = 97; + + array tensorRange = {{sizeDim0, sizeDim1}}; + + DataType* d_out = static_cast(sycl_device.allocate(out_bytes)); + TensorMap> gpu_out(d_out, tensorRange); + Eigen::internal::NormalRandomGenerator gen(true); + gpu_out.device(sycl_device)=gpu_out.random(gen); + sycl_device.memcpyDeviceToHost(out.data(), d_out,out_bytes); + for(IndexType i=1; i void sycl_random_test_per_device(dev_Selector s){ + QueueInterface queueInterface(s); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + test_sycl_random_uniform(sycl_device); + test_sycl_random_uniform(sycl_device); + test_sycl_random_normal(sycl_device); + test_sycl_random_normal(sycl_device); + +} +EIGEN_DECLARE_TEST(cxx11_tensor_random_sycl) +{ + for (const auto& device :Eigen::get_sycl_supported_devices()) { + CALL_SUBTEST(sycl_random_test_per_device(device)); +#ifdef EIGEN_SYCL_DOUBLE_SUPPORT + CALL_SUBTEST(sycl_random_test_per_device(device)); +#endif + } +} diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp index f526299c6..a297716e4 100644 --- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp +++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp @@ -16,16 +16,99 @@ #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL +#define EIGEN_HAS_CONSTEXPR 1 #include "main.h" + #include +template +static void test_full_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 753; + const IndexType num_cols = 537; + array tensorRange = {{num_rows, num_cols}}; + + array outRange = {{1, 1}}; + + Tensor in(tensorRange); + Tensor full_redux(outRange); + Tensor full_redux_gpu(outRange); + + in.setRandom(); + auto dim = DSizes(1, 1); + full_redux = in.sum().reshape(dim); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate( + sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize())); + + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu(gpu_out_data, + outRange); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim); + sycl_device.memcpyDeviceToHost( + full_redux_gpu.data(), gpu_out_data, + (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
+ std::cout << "SYCL FULL :" << full_redux_gpu(0, 0) + << ", CPU FULL: " << full_redux(0, 0) << "\n"; + VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0)); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} template -static void test_full_reductions_mean_sycl(const Eigen::SyclDevice& sycl_device) { +static void test_full_reductions_sum_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using scalar_tensor = Tensor; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + + const IndexType offset = 64; + TensorMap in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.sum(); + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.sum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); - const IndexType num_rows = 452; - const IndexType num_cols = 765; + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_full_reductions_max_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 4096; + const IndexType num_cols = 4096; array tensorRange = {{num_rows, num_cols}}; Tensor in(tensorRange); @@ -34,27 +117,250 @@ static void test_full_reductions_mean_sycl(const Eigen::SyclDevice& sycl_device in.setRandom(); - full_redux = in.mean(); + full_redux = in.maximum(); - DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType)); + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data); + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_full_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using scalar_tensor = Tensor; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + 
in.setRandom(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set the initial value to be the max. + // As we don't include this in the reduction the result should not be 2. + in(0) = static_cast(2); + + const IndexType offset = 64; + TensorMap in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.maximum(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); - out_gpu.device(sycl_device) = in_gpu.mean(); - sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } +template +static void test_full_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 4096; + const IndexType num_cols = 4096; + array tensorRange = {{num_rows, num_cols}}; + array argRange = {{num_cols}}; + Eigen::array red_axis; + red_axis[0] = 0; + // red_axis[1]=1; + Tensor in(tensorRange); + Tensor in_arg1(tensorRange); + Tensor in_arg2(tensorRange); + Tensor out_arg_cpu(argRange); + Tensor out_arg_gpu(argRange); + Tensor out_arg_gpu_helper(argRange); + Tensor full_redux; + Tensor full_redux_gpu; + + in.setRandom(); + in_arg1.setRandom(); + in_arg2.setRandom(); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_in_arg1_data = static_cast(sycl_device.allocate( + in_arg1.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_in_arg2_data = static_cast(sycl_device.allocate( + in_arg2.dimensions().TotalSize() * sizeof(DataType))); + bool* gpu_out_arg__gpu_helper_data = static_cast(sycl_device.allocate( + out_arg_gpu.dimensions().TotalSize() * sizeof(DataType))); + bool* gpu_out_arg_data = static_cast(sycl_device.allocate( + out_arg_gpu.dimensions().TotalSize() * sizeof(DataType))); + + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); + + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> in_Arg1_gpu( + gpu_in_arg1_data, tensorRange); + TensorMap> in_Arg2_gpu( + gpu_in_arg2_data, tensorRange); + TensorMap> out_Argout_gpu( + gpu_out_arg_data, argRange); + TensorMap> out_Argout_gpu_helper( + gpu_out_arg__gpu_helper_data, argRange); + TensorMap> out_gpu(gpu_out_data); + + // CPU VERSION + out_arg_cpu = + (in_arg1.argmax(1) == in_arg2.argmax(1)) + .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false)); + full_redux = (out_arg_cpu.template cast()) + .reduce(red_axis, Eigen::internal::MeanReducer()); + + // GPU VERSION + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_arg1_data, in_arg1.data(), + (in_arg1.dimensions().TotalSize()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice( + 
gpu_in_arg2_data, in_arg2.data(), + (in_arg2.dimensions().TotalSize()) * sizeof(DataType)); + out_Argout_gpu_helper.device(sycl_device) = + (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1)); + out_Argout_gpu.device(sycl_device) = + (out_Argout_gpu_helper) + .select(out_Argout_gpu.constant(true), + out_Argout_gpu.constant(false)); + out_gpu.device(sycl_device) = + (out_Argout_gpu.template cast()) + .reduce(red_axis, Eigen::internal::MeanReducer()); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. + std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux() + << '\n'; + VERIFY_IS_EQUAL(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_in_arg1_data); + sycl_device.deallocate(gpu_in_arg2_data); + sycl_device.deallocate(gpu_out_arg__gpu_helper_data); + sycl_device.deallocate(gpu_out_arg_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_full_reductions_mean_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using scalar_tensor = Tensor; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + + const IndexType offset = 64; + TensorMap in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.mean(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} template -static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device) { +static void test_full_reductions_mean_with_odd_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + // This is a particular case which illustrates a possible problem when the + // number of local threads in a workgroup is even, but is not a power of two. + using data_tensor = Tensor; + using scalar_tensor = Tensor; + // 2177 = (17 * 128) + 1 gives rise to 18 local threads. + // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads. 
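+  // Spelled out: at 128 elements per local thread, ceil(2177 / 128) = 18
+  // threads, and with 4-wide vectorisation ceil(8708 / (4 * 128)) is also
+  // 18, i.e. an even work-group size that is not a power of two, which is
+  // the case this test is meant to exercise.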
+ const IndexType n_elems = 8707; + array tensor_range = {{n_elems}}; + + data_tensor in(tensor_range); + DataType full_redux; + DataType full_redux_gpu; + TensorMap red_cpu(&full_redux); + TensorMap red_gpu(&full_redux_gpu); + + const DataType const_val = static_cast(0.6391); + in = in.constant(const_val); + + Eigen::IndexList> red_axis; + red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer()); + VERIFY_IS_APPROX(const_val, red_cpu()); + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data, tensor_range); + TensorMap out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = + in_gpu.reduce(red_axis, Eigen::internal::MeanReducer()); + sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu, full_redux); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} +template +static void test_full_reductions_min_sycl( + const Eigen::SyclDevice& sycl_device) { const IndexType num_rows = 876; const IndexType num_cols = 953; array tensorRange = {{num_rows, num_cols}}; @@ -67,25 +373,73 @@ static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device) full_redux = in.minimum(); - DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType)); + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data); + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu(gpu_out_data); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.minimum(); - sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType)); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } - template -static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice& sycl_device) { +static void test_full_reductions_min_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using scalar_tensor = Tensor; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set the initial value to be the min. + // As we don't include this in the reduction the result should not be -2. 
+ in(0) = static_cast(-2); + + const IndexType offset = 64; + TensorMap in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.minimum(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.minimum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} +template +static void test_first_dim_reductions_max_sycl( + const Eigen::SyclDevice& sycl_device) { IndexType dim_x = 145; IndexType dim_y = 1; IndexType dim_z = 67; @@ -101,33 +455,293 @@ static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice& sycl_dev in.setRandom(); - redux= in.maximum(red_axis); + redux = in.maximum(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_first_dim_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using reduced_tensor = Tensor; + + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + array reduced_range = {{num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + const IndexType n_reduced = num_cols; + + data_tensor in(tensor_range); + reduced_tensor redux; + reduced_tensor redux_gpu(reduced_range); + + in.setRandom(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set maximum value outside of the considered range. 
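+  // These writes only touch the first 64 linear elements, i.e. everything
+  // before in.data() + offset, so a correct reduction over the offset
+  // TensorMap below never reads them; a reduced value equal to 2 would
+  // indicate an out-of-range read.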
+ for (IndexType i = 0; i < n_reduced; i++) { + in(i) = static_cast(2); + } + + Eigen::array red_axis; + red_axis[0] = 0; + + const IndexType offset = 64; + TensorMap in_offset(in.data() + offset, tensor_offset_range); + redux = in_offset.maximum(red_axis); + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_NOT_EQUAL(redux(i), in(i)); + } + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = static_cast( + sycl_device.allocate(n_reduced * sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data, reduced_range); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); + sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, + n_reduced * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_APPROX(redux_gpu(i), redux(i)); + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_last_dim_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using reduced_tensor = Tensor; + + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + array full_reduced_range = {{num_rows}}; + array reduced_range = {{num_rows - 1}}; + const IndexType n_elems = internal::array_prod(tensor_range); + const IndexType n_reduced = reduced_range[0]; + + data_tensor in(tensor_range); + reduced_tensor redux(full_reduced_range); + reduced_tensor redux_gpu(reduced_range); + + in.setRandom(); + redux.setZero(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set maximum value outside of the considered range. + for (IndexType i = 0; i < n_reduced; i++) { + in(i) = static_cast(2); + } - DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data = static_cast(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); + Eigen::array red_axis; + red_axis[0] = 1; + + const IndexType offset = 64; + // Introduce an offset in both the input and the output. + TensorMap in_offset(in.data() + offset, tensor_offset_range); + TensorMap red_offset(redux.data() + 1, reduced_range); + red_offset = in_offset.maximum(red_axis); + + // Check that the first value hasn't been changed and that the reduced values + // are not equal to the previously set maximum in the input outside the range. 
+ VERIFY_IS_EQUAL(redux(0), static_cast(0)); + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_NOT_EQUAL(red_offset(i), in(i)); + } - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data, reduced_tensorRange); + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = static_cast( + sycl_device.allocate((n_reduced + 1) * sizeof(DataType))); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data + 1, reduced_range); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); - sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(), + n_reduced * sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. - for(IndexType j=0; j +static void test_first_dim_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) { + array tensorRange = {{dim_x, dim_y}}; + Eigen::array red_axis; + red_axis[0] = 0; + array reduced_tensorRange = {{dim_y}}; + + Tensor in(tensorRange); + Tensor redux(reduced_tensorRange); + Tensor redux_gpu(reduced_tensorRange); + + in.setRandom(); + redux = in.sum(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.sum(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType i = 0; i < redux.size(); i++) { + VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]); + } sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } template -static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_device) { +static void test_first_dim_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 145; + IndexType dim_y = 1; + IndexType dim_z = 67; + + array tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array red_axis; + red_axis[0] = 0; + array reduced_tensorRange = {{dim_y, dim_z}}; + + Tensor in(tensorRange); + Tensor redux(reduced_tensorRange); + Tensor redux_gpu(reduced_tensorRange); + + in.setRandom(); + + redux = in.mean(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_last_dim_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 64; + IndexType dim_y = 1; + IndexType dim_z = 32; + + array tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array red_axis; + red_axis[0] = 2; + array reduced_tensorRange = {{dim_x, dim_y}}; + + Tensor in(tensorRange); + Tensor redux(reduced_tensorRange); + Tensor redux_gpu(reduced_tensorRange); + + in.setRandom(); + + redux = in.mean(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu( + gpu_out_data, reduced_tensorRange); - IndexType dim_x = 567; + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_last_dim_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 64; IndexType dim_y = 1; - IndexType dim_z = 47; + IndexType dim_z = 32; array tensorRange = {{dim_x, dim_y, dim_z}}; Eigen::array red_axis; @@ -140,42 +754,261 @@ static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_devi in.setRandom(); - redux= in.sum(red_axis); + redux = in.sum(red_axis); - DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data = static_cast(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data, reduced_tensorRange); + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu( + gpu_out_data, reduced_tensorRange); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.sum(red_axis); - sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. - for(IndexType j=0; j +static void test_last_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + auto tensorRange = Sizes<64, 32>(64, 32); + // auto red_axis = Sizes<0,1>(0,1); + Eigen::IndexList> red_axis; + auto reduced_tensorRange = Sizes<64>(64); + TensorFixedSize, DataLayout> in_fix; + TensorFixedSize, DataLayout> redux_fix; + TensorFixedSize, DataLayout> redux_gpu_fix; + + in_fix.setRandom(); + + redux_fix = in_fix.sum(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap, DataLayout>> in_gpu_fix( + gpu_in_data, tensorRange); + TensorMap, DataLayout>> out_gpu_fix( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in_fix.data(), + (in_fix.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu_fix.data(), gpu_out_data, + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) { + VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j)); + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); } -template void sycl_reduction_test_per_device(const cl::sycl::device& d){ - std::cout << "Running on " << d.template get_info() << std::endl; - QueueInterface queueInterface(d); - auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_full_reductions_mean_sycl(sycl_device); +template +static void test_last_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + auto tensorRange = Sizes<64, 32>(64, 32); + Eigen::IndexList> red_axis; + auto reduced_tensorRange = Sizes<64>(64); + TensorFixedSize, DataLayout> in_fix; + TensorFixedSize, DataLayout> redux_fix; + TensorFixedSize, DataLayout> redux_gpu_fix; + + in_fix.setRandom(); + redux_fix = in_fix.mean(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap, DataLayout>> in_gpu_fix( + gpu_in_data, tensorRange); + TensorMap, DataLayout>> out_gpu_fix( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in_fix.data(), + (in_fix.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu_fix.data(), gpu_out_data, + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)); + sycl_device.synchronize(); + // Check that the CPU and GPU reductions return the same result. + for (IndexType j = 0; j < reduced_tensorRange[0]; j++) { + VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j)); + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +// SYCL supports a generic case of reduction where the accumulator is a +// different type than the input data This is an example on how to get if a +// Tensor contains nan and/or inf in one reduction +template +struct CustomReducer { + static const bool PacketAccess = false; + static const bool IsStateful = false; + + static constexpr OutT InfBit = 1; + static constexpr OutT NanBit = 2; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x, + OutT* accum) const { + if (Eigen::numext::isinf(x)) + *accum |= InfBit; + else if (Eigen::numext::isnan(x)) + *accum |= NanBit; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x, + OutT* accum) const { + *accum |= x; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const { + return OutT(0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const { + return accum; + } +}; + +template +static void test_full_reductions_custom_sycl( + const Eigen::SyclDevice& sycl_device) { + constexpr IndexType InSize = 64; + auto tensorRange = Sizes(InSize); + Eigen::IndexList> dims; + auto reduced_tensorRange = Sizes<>(); + TensorFixedSize, DataLayout> in_fix; + TensorFixedSize, DataLayout> redux_gpu_fix; + + CustomReducer reducer; + + in_fix.setRandom(); + + size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType); + DataType* gpu_in_data = + static_cast(sycl_device.allocate(in_size_bytes)); + AccumType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(AccumType))); + + TensorMap, DataLayout>> in_gpu_fix( + gpu_in_data, tensorRange); + TensorMap, DataLayout>> out_gpu_fix( + gpu_out_data, reduced_tensorRange); + + 
sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes); + out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer); + sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data, + sizeof(AccumType)); + VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +void sycl_reduction_test_full_per_device(const Dev& sycl_device) { + test_full_reductions_sum_sycl(sycl_device); + test_full_reductions_sum_sycl(sycl_device); + test_full_reductions_min_sycl(sycl_device); test_full_reductions_min_sycl(sycl_device); + test_full_reductions_max_sycl(sycl_device); + test_full_reductions_max_sycl(sycl_device); + + test_full_reductions_mean_sycl(sycl_device); + test_full_reductions_mean_sycl(sycl_device); + test_full_reductions_custom_sycl( + sycl_device); + test_full_reductions_custom_sycl( + sycl_device); + sycl_device.synchronize(); +} + +template +void sycl_reduction_full_offset_per_device(const Dev& sycl_device) { + test_full_reductions_sum_with_offset_sycl( + sycl_device); + test_full_reductions_sum_with_offset_sycl( + sycl_device); + test_full_reductions_min_with_offset_sycl( + sycl_device); + test_full_reductions_min_with_offset_sycl( + sycl_device); + test_full_reductions_max_with_offset_sycl( + sycl_device); + test_full_reductions_max_with_offset_sycl( + sycl_device); + test_full_reductions_mean_with_offset_sycl( + sycl_device); + test_full_reductions_mean_with_offset_sycl( + sycl_device); + test_full_reductions_mean_with_odd_offset_sycl( + sycl_device); + sycl_device.synchronize(); +} + +template +void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) { + test_first_dim_reductions_sum_sycl(sycl_device, + 4197, 4097); + test_first_dim_reductions_sum_sycl(sycl_device, + 4197, 4097); + test_first_dim_reductions_sum_sycl(sycl_device, + 129, 8); test_first_dim_reductions_max_sycl(sycl_device); + test_first_dim_reductions_max_with_offset_sycl( + sycl_device); + sycl_device.synchronize(); +} + +template +void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) { test_last_dim_reductions_sum_sycl(sycl_device); - test_full_reductions_mean_sycl(sycl_device); - test_full_reductions_min_sycl(sycl_device); - test_first_dim_reductions_max_sycl(sycl_device); - test_last_dim_reductions_sum_sycl(sycl_device); + test_last_dim_reductions_max_with_offset_sycl( + sycl_device); + test_last_reductions_sum_sycl(sycl_device); + test_last_reductions_sum_sycl(sycl_device); + test_last_reductions_mean_sycl(sycl_device); + test_last_reductions_mean_sycl(sycl_device); + sycl_device.synchronize(); } + EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_reduction_test_per_device(device)); + for (const auto& device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.template get_info() + << std::endl; + QueueInterface queueInterface(device); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + CALL_SUBTEST_1(sycl_reduction_test_full_per_device(sycl_device)); + CALL_SUBTEST_2(sycl_reduction_full_offset_per_device(sycl_device)); + CALL_SUBTEST_3( + sycl_reduction_test_first_dim_per_device(sycl_device)); + CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device(sycl_device)); } } diff --git a/unsupported/test/cxx11_tensor_reverse_sycl.cpp b/unsupported/test/cxx11_tensor_reverse_sycl.cpp index 77c2235d1..dd30c235d 100644 --- 
a/unsupported/test/cxx11_tensor_reverse_sycl.cpp +++ b/unsupported/test/cxx11_tensor_reverse_sycl.cpp @@ -20,10 +20,8 @@ #include "main.h" #include - template -static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { - +static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { IndexType dim1 = 2; IndexType dim2 = 3; IndexType dim3 = 5; @@ -40,21 +38,30 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { dim_rev[2] = true; dim_rev[3] = false; - DataType* gpu_in_data = static_cast(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data =static_cast(sycl_device.allocate(reversed_tensor.dimensions().TotalSize()*sizeof(DataType))); + DataType* gpu_in_data = static_cast( + sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + reversed_tensor.dimensions().TotalSize() * sizeof(DataType))); - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data, tensorRange); + TensorMap > in_gpu(gpu_in_data, + tensorRange); + TensorMap > out_gpu(gpu_out_data, + tensorRange); - sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_data, tensor.data(), + (tensor.dimensions().TotalSize()) * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); - sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost( + reversed_tensor.data(), gpu_out_data, + reversed_tensor.dimensions().TotalSize() * sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. 
for (IndexType i = 0; i < 2; ++i) { for (IndexType j = 0; j < 3; ++j) { for (IndexType k = 0; k < 5; ++k) { for (IndexType l = 0; l < 7; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(i,2-j,4-k,l)); + VERIFY_IS_EQUAL(tensor(i, j, k, l), + reversed_tensor(i, 2 - j, 4 - k, l)); } } } @@ -65,13 +72,15 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { dim_rev[3] = false; out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); - sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost( + reversed_tensor.data(), gpu_out_data, + reversed_tensor.dimensions().TotalSize() * sizeof(DataType)); for (IndexType i = 0; i < 2; ++i) { for (IndexType j = 0; j < 3; ++j) { for (IndexType k = 0; k < 5; ++k) { for (IndexType l = 0; l < 7; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,l)); + VERIFY_IS_EQUAL(tensor(i, j, k, l), reversed_tensor(1 - i, j, k, l)); } } } @@ -82,13 +91,16 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { dim_rev[2] = false; dim_rev[3] = true; out_gpu.device(sycl_device) = in_gpu.reverse(dim_rev); - sycl_device.memcpyDeviceToHost(reversed_tensor.data(), gpu_out_data, reversed_tensor.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost( + reversed_tensor.data(), gpu_out_data, + reversed_tensor.dimensions().TotalSize() * sizeof(DataType)); for (IndexType i = 0; i < 2; ++i) { for (IndexType j = 0; j < 3; ++j) { for (IndexType k = 0; k < 5; ++k) { for (IndexType l = 0; l < 7; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), reversed_tensor(1-i,j,k,6-l)); + VERIFY_IS_EQUAL(tensor(i, j, k, l), + reversed_tensor(1 - i, j, k, 6 - l)); } } } @@ -98,11 +110,9 @@ static void test_simple_reverse(const Eigen::SyclDevice& sycl_device) { sycl_device.deallocate(gpu_out_data); } - - template -static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue) -{ +static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, + bool LValue) { IndexType dim1 = 2; IndexType dim2 = 3; IndexType dim3 = 5; @@ -120,24 +130,32 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue dim_rev[2] = false; dim_rev[3] = true; - DataType* gpu_in_data = static_cast(sycl_device.allocate(tensor.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data_expected =static_cast(sycl_device.allocate(expected.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data_result =static_cast(sycl_device.allocate(result.dimensions().TotalSize()*sizeof(DataType))); - - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu_expected(gpu_out_data_expected, tensorRange); - TensorMap > out_gpu_result(gpu_out_data_result, tensorRange); + DataType* gpu_in_data = static_cast( + sycl_device.allocate(tensor.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data_expected = static_cast(sycl_device.allocate( + expected.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data_result = static_cast( + sycl_device.allocate(result.dimensions().TotalSize() * sizeof(DataType))); + TensorMap > in_gpu(gpu_in_data, + tensorRange); + TensorMap > out_gpu_expected( + gpu_out_data_expected, tensorRange); + TensorMap > out_gpu_result( + gpu_out_data_result, tensorRange); - sycl_device.memcpyHostToDevice(gpu_in_data, tensor.data(),(tensor.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_data, tensor.data(), + 
(tensor.dimensions().TotalSize()) * sizeof(DataType)); if (LValue) { out_gpu_expected.reverse(dim_rev).device(sycl_device) = in_gpu; } else { out_gpu_expected.device(sycl_device) = in_gpu.reverse(dim_rev); } - sycl_device.memcpyDeviceToHost(expected.data(), gpu_out_data_expected, expected.dimensions().TotalSize()*sizeof(DataType)); - + sycl_device.memcpyDeviceToHost( + expected.data(), gpu_out_data_expected, + expected.dimensions().TotalSize() * sizeof(DataType)); array src_slice_dim; src_slice_dim[0] = 2; @@ -154,8 +172,9 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue for (IndexType i = 0; i < 5; ++i) { if (LValue) { - out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) = - in_gpu.slice(src_slice_start, src_slice_dim); + out_gpu_result.slice(dst_slice_start, dst_slice_dim) + .reverse(dim_rev) + .device(sycl_device) = in_gpu.slice(src_slice_start, src_slice_dim); } else { out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) = in_gpu.slice(src_slice_start, src_slice_dim).reverse(dim_rev); @@ -163,13 +182,15 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue src_slice_start[2] += 1; dst_slice_start[2] += 1; } - sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost( + result.data(), gpu_out_data_result, + result.dimensions().TotalSize() * sizeof(DataType)); for (IndexType i = 0; i < expected.dimension(0); ++i) { for (IndexType j = 0; j < expected.dimension(1); ++j) { for (IndexType k = 0; k < expected.dimension(2); ++k) { for (IndexType l = 0; l < expected.dimension(3); ++l) { - VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l)); } } } @@ -177,34 +198,37 @@ static void test_expr_reverse(const Eigen::SyclDevice& sycl_device, bool LValue dst_slice_start[2] = 0; result.setRandom(); - sycl_device.memcpyHostToDevice(gpu_out_data_result, result.data(),(result.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_out_data_result, result.data(), + (result.dimensions().TotalSize()) * sizeof(DataType)); for (IndexType i = 0; i < 5; ++i) { - if (LValue) { - out_gpu_result.slice(dst_slice_start, dst_slice_dim).reverse(dim_rev).device(sycl_device) = - in_gpu.slice(dst_slice_start, dst_slice_dim); - } else { - out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) = - in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); - } + if (LValue) { + out_gpu_result.slice(dst_slice_start, dst_slice_dim) + .reverse(dim_rev) + .device(sycl_device) = in_gpu.slice(dst_slice_start, dst_slice_dim); + } else { + out_gpu_result.slice(dst_slice_start, dst_slice_dim).device(sycl_device) = + in_gpu.reverse(dim_rev).slice(dst_slice_start, dst_slice_dim); + } dst_slice_start[2] += 1; } - sycl_device.memcpyDeviceToHost(result.data(), gpu_out_data_result, result.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost( + result.data(), gpu_out_data_result, + result.dimensions().TotalSize() * sizeof(DataType)); for (IndexType i = 0; i < expected.dimension(0); ++i) { for (IndexType j = 0; j < expected.dimension(1); ++j) { for (IndexType k = 0; k < expected.dimension(2); ++k) { for (IndexType l = 0; l < expected.dimension(3); ++l) { - VERIFY_IS_EQUAL(result(i,j,k,l), expected(i,j,k,l)); + VERIFY_IS_EQUAL(result(i, j, k, l), expected(i, j, k, l)); } } } } } - - -template void 
sycl_reverse_test_per_device(const cl::sycl::device& d){ - std::cout << "Running on " << d.template get_info() << std::endl; +template +void sycl_reverse_test_per_device(const cl::sycl::device& d) { QueueInterface queueInterface(d); auto sycl_device = Eigen::SyclDevice(&queueInterface); test_simple_reverse(sycl_device); @@ -215,7 +239,15 @@ template void sycl_reverse_test_per_device(const cl::sycl::de test_expr_reverse(sycl_device, true); } EIGEN_DECLARE_TEST(cxx11_tensor_reverse_sycl) { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_reverse_test_per_device(device)); + for (const auto& device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.get_info() << std::endl; + CALL_SUBTEST_1(sycl_reverse_test_per_device(device)); + CALL_SUBTEST_2(sycl_reverse_test_per_device(device)); + CALL_SUBTEST_3(sycl_reverse_test_per_device(device)); +#ifdef EIGEN_SYCL_DOUBLE_SUPPORT + CALL_SUBTEST_4(sycl_reverse_test_per_device(device)); +#endif + CALL_SUBTEST_5(sycl_reverse_test_per_device(device)); } } diff --git a/unsupported/test/cxx11_tensor_scan_sycl.cpp b/unsupported/test/cxx11_tensor_scan_sycl.cpp new file mode 100644 index 000000000..09c45fce5 --- /dev/null +++ b/unsupported/test/cxx11_tensor_scan_sycl.cpp @@ -0,0 +1,141 @@ +// This file is part of Eigen, a lightweight C++ template library +// for linear algebra. +// +// Copyright (C) 2016 +// Mehdi Goli Codeplay Software Ltd. +// Ralph Potter Codeplay Software Ltd. +// Luke Iwanski Codeplay Software Ltd. +// Contact: +// +// This Source Code Form is subject to the terms of the Mozilla +// Public License v. 2.0. If a copy of the MPL was not distributed +// with this file, You can obtain one at http://mozilla.org/MPL/2.0/. + +#define EIGEN_TEST_NO_LONGDOUBLE +#define EIGEN_TEST_NO_COMPLEX +#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t +#define EIGEN_USE_SYCL + +#include "main.h" +#include + +using Eigen::Tensor; +typedef Tensor::DimensionPair DimPair; + +template +void test_sycl_cumsum(const Eigen::SyclDevice& sycl_device, IndexType m_size, + IndexType k_size, IndexType n_size, int consume_dim, + bool exclusive) { + static const DataType error_threshold = 1e-4f; + std::cout << "Testing for (" << m_size << "," << k_size << "," << n_size + << " consume_dim : " << consume_dim << ")" << std::endl; + Tensor t_input(m_size, k_size, n_size); + Tensor t_result(m_size, k_size, n_size); + Tensor t_result_gpu(m_size, k_size, + n_size); + + t_input.setRandom(); + std::size_t t_input_bytes = t_input.size() * sizeof(DataType); + std::size_t t_result_bytes = t_result.size() * sizeof(DataType); + + DataType* gpu_data_in = + static_cast(sycl_device.allocate(t_input_bytes)); + DataType* gpu_data_out = + static_cast(sycl_device.allocate(t_result_bytes)); + + array tensorRange = {{m_size, k_size, n_size}}; + TensorMap> gpu_t_input( + gpu_data_in, tensorRange); + TensorMap> gpu_t_result( + gpu_data_out, tensorRange); + sycl_device.memcpyHostToDevice(gpu_data_in, t_input.data(), t_input_bytes); + sycl_device.memcpyHostToDevice(gpu_data_out, t_input.data(), t_input_bytes); + + gpu_t_result.device(sycl_device) = gpu_t_input.cumsum(consume_dim, exclusive); + + t_result = t_input.cumsum(consume_dim, exclusive); + + sycl_device.memcpyDeviceToHost(t_result_gpu.data(), gpu_data_out, + t_result_bytes); + sycl_device.synchronize(); + + for (IndexType i = 0; i < t_result.size(); i++) { + if (static_cast(std::fabs(static_cast( + t_result(i) - t_result_gpu(i)))) < error_threshold) { + continue; + } + if 
(Eigen::internal::isApprox(t_result(i), t_result_gpu(i), + error_threshold)) { + continue; + } + std::cout << "mismatch detected at index " << i << " CPU : " << t_result(i) + << " vs SYCL : " << t_result_gpu(i) << std::endl; + assert(false); + } + sycl_device.deallocate(gpu_data_in); + sycl_device.deallocate(gpu_data_out); +} + +template +void sycl_scan_test_exclusive_dim0_per_device(const Dev& sycl_device) { + test_sycl_cumsum(sycl_device, 2049, 1023, 127, 0, + true); + test_sycl_cumsum(sycl_device, 2049, 1023, 127, 0, + true); +} +template +void sycl_scan_test_exclusive_dim1_per_device(const Dev& sycl_device) { + test_sycl_cumsum(sycl_device, 1023, 2049, 127, 1, + true); + test_sycl_cumsum(sycl_device, 1023, 2049, 127, 1, + true); +} +template +void sycl_scan_test_exclusive_dim2_per_device(const Dev& sycl_device) { + test_sycl_cumsum(sycl_device, 1023, 127, 2049, 2, + true); + test_sycl_cumsum(sycl_device, 1023, 127, 2049, 2, + true); +} +template +void sycl_scan_test_inclusive_dim0_per_device(const Dev& sycl_device) { + test_sycl_cumsum(sycl_device, 2049, 1023, 127, 0, + false); + test_sycl_cumsum(sycl_device, 2049, 1023, 127, 0, + false); +} +template +void sycl_scan_test_inclusive_dim1_per_device(const Dev& sycl_device) { + test_sycl_cumsum(sycl_device, 1023, 2049, 127, 1, + false); + test_sycl_cumsum(sycl_device, 1023, 2049, 127, 1, + false); +} +template +void sycl_scan_test_inclusive_dim2_per_device(const Dev& sycl_device) { + test_sycl_cumsum(sycl_device, 1023, 127, 2049, 2, + false); + test_sycl_cumsum(sycl_device, 1023, 127, 2049, 2, + false); +} +EIGEN_DECLARE_TEST(cxx11_tensor_scan_sycl) { + for (const auto& device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.template get_info() + << std::endl; + QueueInterface queueInterface(device); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + CALL_SUBTEST_1( + sycl_scan_test_exclusive_dim0_per_device(sycl_device)); + CALL_SUBTEST_2( + sycl_scan_test_exclusive_dim1_per_device(sycl_device)); + CALL_SUBTEST_3( + sycl_scan_test_exclusive_dim2_per_device(sycl_device)); + CALL_SUBTEST_4( + sycl_scan_test_inclusive_dim0_per_device(sycl_device)); + CALL_SUBTEST_5( + sycl_scan_test_inclusive_dim1_per_device(sycl_device)); + CALL_SUBTEST_6( + sycl_scan_test_inclusive_dim2_per_device(sycl_device)); + } +} diff --git a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp index 0e8cc3bd2..ca4e8b5ef 100644 --- a/unsupported/test/cxx11_tensor_shuffling_sycl.cpp +++ b/unsupported/test/cxx11_tensor_shuffling_sycl.cpp @@ -12,14 +12,12 @@ // Public License v. 2.0. If a copy of the MPL was not distributed // with this file, You can obtain one at http://mozilla.org/MPL/2.0/. 
- #define EIGEN_TEST_NO_LONGDOUBLE #define EIGEN_TEST_NO_COMPLEX #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t #define EIGEN_USE_SYCL - #include "main.h" #include @@ -29,33 +27,33 @@ using Eigen::Tensor; using Eigen::TensorMap; template -static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) -{ +static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) { IndexType sizeDim1 = 2; IndexType sizeDim2 = 3; IndexType sizeDim3 = 5; IndexType sizeDim4 = 7; array tensorRange = {{sizeDim1, sizeDim2, sizeDim3, sizeDim4}}; - Tensor tensor(tensorRange); - Tensor no_shuffle(tensorRange); + Tensor tensor(tensorRange); + Tensor no_shuffle(tensorRange); tensor.setRandom(); - const size_t buffSize =tensor.size()*sizeof(DataType); + const size_t buffSize = tensor.size() * sizeof(DataType); array shuffles; shuffles[0] = 0; shuffles[1] = 1; shuffles[2] = 2; shuffles[3] = 3; - DataType* gpu_data1 = static_cast(sycl_device.allocate(buffSize)); - DataType* gpu_data2 = static_cast(sycl_device.allocate(buffSize)); - + DataType* gpu_data1 = static_cast(sycl_device.allocate(buffSize)); + DataType* gpu_data2 = static_cast(sycl_device.allocate(buffSize)); - TensorMap> gpu1(gpu_data1, tensorRange); - TensorMap> gpu2(gpu_data2, tensorRange); + TensorMap> gpu1(gpu_data1, + tensorRange); + TensorMap> gpu2(gpu_data2, + tensorRange); sycl_device.memcpyHostToDevice(gpu_data1, tensor.data(), buffSize); - gpu2.device(sycl_device)=gpu1.shuffle(shuffles); + gpu2.device(sycl_device) = gpu1.shuffle(shuffles); sycl_device.memcpyDeviceToHost(no_shuffle.data(), gpu_data2, buffSize); sycl_device.synchronize(); @@ -68,7 +66,7 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) for (IndexType j = 0; j < sizeDim2; ++j) { for (IndexType k = 0; k < sizeDim3; ++k) { for (IndexType l = 0; l < sizeDim4; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), no_shuffle(i,j,k,l)); + VERIFY_IS_EQUAL(tensor(i, j, k, l), no_shuffle(i, j, k, l)); } } } @@ -78,12 +76,14 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) shuffles[1] = 3; shuffles[2] = 1; shuffles[3] = 0; - array tensorrangeShuffle = {{sizeDim3, sizeDim4, sizeDim2, sizeDim1}}; - Tensor shuffle(tensorrangeShuffle); - DataType* gpu_data3 = static_cast(sycl_device.allocate(buffSize)); - TensorMap> gpu3(gpu_data3, tensorrangeShuffle); - - gpu3.device(sycl_device)=gpu1.shuffle(shuffles); + array tensorrangeShuffle = { + {sizeDim3, sizeDim4, sizeDim2, sizeDim1}}; + Tensor shuffle(tensorrangeShuffle); + DataType* gpu_data3 = static_cast(sycl_device.allocate(buffSize)); + TensorMap> gpu3( + gpu_data3, tensorrangeShuffle); + + gpu3.device(sycl_device) = gpu1.shuffle(shuffles); sycl_device.memcpyDeviceToHost(shuffle.data(), gpu_data3, buffSize); sycl_device.synchronize(); @@ -96,24 +96,22 @@ static void test_simple_shuffling_sycl(const Eigen::SyclDevice& sycl_device) for (IndexType j = 0; j < sizeDim2; ++j) { for (IndexType k = 0; k < sizeDim3; ++k) { for (IndexType l = 0; l < sizeDim4; ++l) { - VERIFY_IS_EQUAL(tensor(i,j,k,l), shuffle(k,l,j,i)); + VERIFY_IS_EQUAL(tensor(i, j, k, l), shuffle(k, l, j, i)); } } } } } - -template void sycl_shuffling_test_per_device(dev_Selector s){ +template +void sycl_shuffling_test_per_device(dev_Selector s) { QueueInterface queueInterface(s); auto sycl_device = Eigen::SyclDevice(&queueInterface); test_simple_shuffling_sycl(sycl_device); test_simple_shuffling_sycl(sycl_device); - } -EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) -{ - for (const auto& device 
:Eigen::get_sycl_supported_devices()) { +EIGEN_DECLARE_TEST(cxx11_tensor_shuffling_sycl) { + for (const auto& device : Eigen::get_sycl_supported_devices()) { CALL_SUBTEST(sycl_shuffling_test_per_device(device)); } } diff --git a/unsupported/test/cxx11_tensor_sycl.cpp b/unsupported/test/cxx11_tensor_sycl.cpp index 9357bed02..e6c5e2378 100644 --- a/unsupported/test/cxx11_tensor_sycl.cpp +++ b/unsupported/test/cxx11_tensor_sycl.cpp @@ -29,9 +29,9 @@ using Eigen::TensorMap; template void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { - IndexType sizeDim1 = 100; - IndexType sizeDim2 = 10; - IndexType sizeDim3 = 20; + IndexType sizeDim1 = 5; + IndexType sizeDim2 = 5; + IndexType sizeDim3 = 1; array tensorRange = {{sizeDim1, sizeDim2, sizeDim3}}; Tensor in1(tensorRange); Tensor out1(tensorRange); @@ -56,6 +56,7 @@ void test_sycl_mem_transfers(const Eigen::SyclDevice &sycl_device) { sycl_device.synchronize(); for (IndexType i = 0; i < in1.size(); ++i) { + // std::cout << "SYCL DATA : " << out1(i) << " vs CPU DATA : " << in1(i) * 3.14f << "\n"; VERIFY_IS_APPROX(out1(i), in1(i) * 3.14f); VERIFY_IS_APPROX(out2(i), in1(i) * 3.14f); VERIFY_IS_APPROX(out3(i), in1(i) * 2.7f); @@ -93,6 +94,88 @@ void test_sycl_mem_sync(const Eigen::SyclDevice &sycl_device) { sycl_device.deallocate(gpu_data); } +template +void test_sycl_mem_sync_offsets(const Eigen::SyclDevice &sycl_device) { + using tensor_type = Tensor; + IndexType full_size = 32; + IndexType half_size = full_size / 2; + array tensorRange = {{full_size}}; + tensor_type in1(tensorRange); + tensor_type out(tensorRange); + + DataType* gpu_data = static_cast(sycl_device.allocate(full_size * sizeof(DataType))); + TensorMap gpu1(gpu_data, tensorRange); + + in1 = in1.random(); + // Copy all data to device, then permute on copy back to host + sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data + half_size, half_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data() + half_size, gpu_data, half_size * sizeof(DataType)); + + for (IndexType i = 0; i < half_size; ++i) { + VERIFY_IS_APPROX(out(i), in1(i + half_size)); + VERIFY_IS_APPROX(out(i + half_size), in1(i)); + } + + in1 = in1.random(); + out.setZero(); + // Permute copies to device, then copy all back to host + sycl_device.memcpyHostToDevice(gpu_data + half_size, in1.data(), half_size * sizeof(DataType)); + sycl_device.memcpyHostToDevice(gpu_data, in1.data() + half_size, half_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType)); + + for (IndexType i = 0; i < half_size; ++i) { + VERIFY_IS_APPROX(out(i), in1(i + half_size)); + VERIFY_IS_APPROX(out(i + half_size), in1(i)); + } + + in1 = in1.random(); + out.setZero(); + DataType* gpu_data_out = static_cast(sycl_device.allocate(full_size * sizeof(DataType))); + TensorMap gpu2(gpu_data_out, tensorRange); + // Copy all to device, permute copies on device, then copy all back to host + sycl_device.memcpyHostToDevice(gpu_data, in1.data(), full_size * sizeof(DataType)); + sycl_device.memcpy(gpu_data_out + half_size, gpu_data, half_size * sizeof(DataType)); + sycl_device.memcpy(gpu_data_out, gpu_data + half_size, half_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data_out, full_size * sizeof(DataType)); + + for (IndexType i = 0; i < half_size; ++i) { + VERIFY_IS_APPROX(out(i), in1(i + half_size)); + VERIFY_IS_APPROX(out(i + half_size), in1(i)); + } + + 
sycl_device.deallocate(gpu_data_out); + sycl_device.deallocate(gpu_data); +} + +template +void test_sycl_memset_offsets(const Eigen::SyclDevice &sycl_device) { + using tensor_type = Tensor; + IndexType full_size = 32; + IndexType half_size = full_size / 2; + array tensorRange = {{full_size}}; + tensor_type cpu_out(tensorRange); + tensor_type out(tensorRange); + + cpu_out.setZero(); + + std::memset(cpu_out.data(), 0, half_size * sizeof(DataType)); + std::memset(cpu_out.data() + half_size, 1, half_size * sizeof(DataType)); + + DataType* gpu_data = static_cast(sycl_device.allocate(full_size * sizeof(DataType))); + TensorMap gpu1(gpu_data, tensorRange); + + sycl_device.memset(gpu_data, 0, half_size * sizeof(DataType)); + sycl_device.memset(gpu_data + half_size, 1, half_size * sizeof(DataType)); + sycl_device.memcpyDeviceToHost(out.data(), gpu_data, full_size * sizeof(DataType)); + + for (IndexType i = 0; i < full_size; ++i) { + VERIFY_IS_APPROX(out(i), cpu_out(i)); + } + + sycl_device.deallocate(gpu_data); +} + template void test_sycl_computations(const Eigen::SyclDevice &sycl_device) { @@ -262,6 +345,8 @@ template void sycl_computing_test_per_ test_sycl_mem_transfers(sycl_device); test_sycl_computations(sycl_device); test_sycl_mem_sync(sycl_device); + test_sycl_mem_sync_offsets(sycl_device); + test_sycl_memset_offsets(sycl_device); test_sycl_mem_transfers(sycl_device); test_sycl_computations(sycl_device); test_sycl_mem_sync(sycl_device); -- cgit v1.2.3
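The *_with_offset_sycl reduction tests and the memcpy/memset offset tests added by this patch all exercise the same idiom: re-mapping a sub-range of an already allocated buffer by constructing a TensorMap from data() + offset with a shrunken dimension array, so the reduction (or copy) never touches the skipped elements. Below is a minimal host-only sketch of that idiom, assuming only the unsupported Eigen Tensor headers on the include path and no SYCL device; the 64x64 shape and the offset of 64 mirror the values used in cxx11_tensor_reduction_sycl.cpp, and the variable names are illustrative rather than taken from the patch.

#include <unsupported/Eigen/CXX11/Tensor>
#include <iostream>

int main() {
  using Eigen::Index;
  using Eigen::Tensor;
  using Eigen::TensorMap;

  // 64x64 tensor, column-major by default (same shape as the
  // *_with_offset_sycl tests above).
  Tensor<float, 2> in(64, 64);
  in.setRandom();
  in(0, 0) = -2.0f;  // sentinel value that the offset view must not see

  // Re-map the buffer starting 64 elements in. The mapped shape drops one
  // row (63 x 64) so that offset + view size still fits inside the original
  // 64 x 64 allocation: 64 + 63 * 64 == 64 * 64.
  const Index offset = 64;
  TensorMap<Tensor<float, 2>> in_offset(in.data() + offset, 63, 64);

  // Full reduction over the offset view; the sentinel at in(0, 0) sits in
  // the skipped leading 64 elements and is therefore excluded.
  Tensor<float, 0> min_val = in_offset.minimum();
  std::cout << "minimum over the offset view: " << min_val() << "\n";
  return 0;
}

This is also why the tests shrink tensor_offset_range[0] by one before building the offset TensorMap: without dropping a row, the offset view would read past the end of the original allocation. The device-side versions in the patch follow the same pattern, simply replacing in.data() + offset with gpu_in_data + offset on a buffer obtained from sycl_device.allocate().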