From 00f32752f7d0b193c6788691c3cf0b76457a044d Mon Sep 17 00:00:00 2001
From: Mehdi Goli
Date: Thu, 28 Nov 2019 10:08:54 +0000
Subject: [SYCL] Rebasing the SYCL support branch on top of the Eigen upstream
 master branch.

* Unifying all loadLocalTile from lhs and rhs into an extract_block function.
* Adding the get_tensor operation, which was missing in TensorContractionMapper.
* Adding the -D option missing from cmake for the Disable_Skinny Contraction operation.
* Wrapping all the indices in TensorScanSycl into a Scan parameter struct.
* Fixing a typo in Device SYCL.
* Unifying load to private register for tall/skinny no-shared.
* Unifying load to vector tile for tensor-vector/vector-tensor operation.
* Removing all the LHS/RHS classes for extracting data from global memory.
* Removing Outputfunction from TensorContractionSkinnyNoshared.
* Combining the local memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining the no-local memory version of tall/skinny and normal tensor contraction into one kernel.
* Combining General Tensor-Vector and VectorTensor contraction into one kernel.
* Making double buffering optional for tensor contraction when the local memory version is used.
* Modifying the benchmark to accept custom reduction sizes.
* Disabling AVX optimization for the SYCL backend on the host to allow SSE optimization on the host.
* Adding tests for SYCL.
* Modifying SYCL CMake.
---
 unsupported/test/cxx11_tensor_reduction_sycl.cpp | 941 +++++++++++++++++++++--
 1 file changed, 887 insertions(+), 54 deletions(-)

diff --git a/unsupported/test/cxx11_tensor_reduction_sycl.cpp b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
index f526299c6..a297716e4 100644
--- a/unsupported/test/cxx11_tensor_reduction_sycl.cpp
+++ b/unsupported/test/cxx11_tensor_reduction_sycl.cpp
@@ -16,16 +16,99 @@
 #define EIGEN_DEFAULT_DENSE_INDEX_TYPE int64_t
 #define EIGEN_USE_SYCL
+#define EIGEN_HAS_CONSTEXPR 1
 #include "main.h"
+
 #include
+template
+static void test_full_reductions_sum_sycl(
+    const Eigen::SyclDevice& sycl_device) {
+  const IndexType num_rows = 753;
+  const IndexType num_cols = 537;
+  array tensorRange = {{num_rows, num_cols}};
+
+  array outRange = {{1, 1}};
+
+  Tensor in(tensorRange);
+  Tensor full_redux(outRange);
+  Tensor full_redux_gpu(outRange);
+
+  in.setRandom();
+  auto dim = DSizes(1, 1);
+  full_redux = in.sum().reshape(dim);
+
+  DataType* gpu_in_data = static_cast(
+      sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType)));
+  DataType* gpu_out_data = (DataType*)sycl_device.allocate(
+      sizeof(DataType) * (full_redux_gpu.dimensions().TotalSize()));
+
+  TensorMap> in_gpu(gpu_in_data,
+                    tensorRange);
+  TensorMap> out_gpu(gpu_out_data,
+                     outRange);
+  sycl_device.memcpyHostToDevice(
+      gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType));
+  out_gpu.device(sycl_device) = in_gpu.sum().reshape(dim);
+  sycl_device.memcpyDeviceToHost(
+      full_redux_gpu.data(), gpu_out_data,
+      (full_redux_gpu.dimensions().TotalSize()) * sizeof(DataType));
+  // Check that the CPU and GPU reductions return the same result.
+ std::cout << "SYCL FULL :" << full_redux_gpu(0, 0) + << ", CPU FULL: " << full_redux(0, 0) << "\n"; + VERIFY_IS_APPROX(full_redux_gpu(0, 0), full_redux(0, 0)); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} template -static void test_full_reductions_mean_sycl(const Eigen::SyclDevice& sycl_device) { +static void test_full_reductions_sum_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using scalar_tensor = Tensor; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + + const IndexType offset = 64; + TensorMap in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.sum(); + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.sum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); - const IndexType num_rows = 452; - const IndexType num_cols = 765; + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_full_reductions_max_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 4096; + const IndexType num_cols = 4096; array tensorRange = {{num_rows, num_cols}}; Tensor in(tensorRange); @@ -34,27 +117,250 @@ static void test_full_reductions_mean_sycl(const Eigen::SyclDevice& sycl_device in.setRandom(); - full_redux = in.mean(); + full_redux = in.maximum(); - DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType)); + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data); + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_full_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using scalar_tensor = Tensor; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + 
in.setRandom(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set the initial value to be the max. + // As we don't include this in the reduction the result should not be 2. + in(0) = static_cast(2); + + const IndexType offset = 64; + TensorMap in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.maximum(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); - out_gpu.device(sycl_device) = in_gpu.mean(); - sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } +template +static void test_full_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + const IndexType num_rows = 4096; + const IndexType num_cols = 4096; + array tensorRange = {{num_rows, num_cols}}; + array argRange = {{num_cols}}; + Eigen::array red_axis; + red_axis[0] = 0; + // red_axis[1]=1; + Tensor in(tensorRange); + Tensor in_arg1(tensorRange); + Tensor in_arg2(tensorRange); + Tensor out_arg_cpu(argRange); + Tensor out_arg_gpu(argRange); + Tensor out_arg_gpu_helper(argRange); + Tensor full_redux; + Tensor full_redux_gpu; + + in.setRandom(); + in_arg1.setRandom(); + in_arg2.setRandom(); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_in_arg1_data = static_cast(sycl_device.allocate( + in_arg1.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_in_arg2_data = static_cast(sycl_device.allocate( + in_arg2.dimensions().TotalSize() * sizeof(DataType))); + bool* gpu_out_arg__gpu_helper_data = static_cast(sycl_device.allocate( + out_arg_gpu.dimensions().TotalSize() * sizeof(DataType))); + bool* gpu_out_arg_data = static_cast(sycl_device.allocate( + out_arg_gpu.dimensions().TotalSize() * sizeof(DataType))); + + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); + + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> in_Arg1_gpu( + gpu_in_arg1_data, tensorRange); + TensorMap> in_Arg2_gpu( + gpu_in_arg2_data, tensorRange); + TensorMap> out_Argout_gpu( + gpu_out_arg_data, argRange); + TensorMap> out_Argout_gpu_helper( + gpu_out_arg__gpu_helper_data, argRange); + TensorMap> out_gpu(gpu_out_data); + + // CPU VERSION + out_arg_cpu = + (in_arg1.argmax(1) == in_arg2.argmax(1)) + .select(out_arg_cpu.constant(true), out_arg_cpu.constant(false)); + full_redux = (out_arg_cpu.template cast()) + .reduce(red_axis, Eigen::internal::MeanReducer()); + + // GPU VERSION + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_arg1_data, in_arg1.data(), + (in_arg1.dimensions().TotalSize()) * sizeof(DataType)); + sycl_device.memcpyHostToDevice( + 
gpu_in_arg2_data, in_arg2.data(), + (in_arg2.dimensions().TotalSize()) * sizeof(DataType)); + out_Argout_gpu_helper.device(sycl_device) = + (in_Arg1_gpu.argmax(1) == in_Arg2_gpu.argmax(1)); + out_Argout_gpu.device(sycl_device) = + (out_Argout_gpu_helper) + .select(out_Argout_gpu.constant(true), + out_Argout_gpu.constant(false)); + out_gpu.device(sycl_device) = + (out_Argout_gpu.template cast()) + .reduce(red_axis, Eigen::internal::MeanReducer()); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. + std::cout << "SYCL : " << full_redux_gpu() << " , CPU : " << full_redux() + << '\n'; + VERIFY_IS_EQUAL(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_in_arg1_data); + sycl_device.deallocate(gpu_in_arg2_data); + sycl_device.deallocate(gpu_out_arg__gpu_helper_data); + sycl_device.deallocate(gpu_out_arg_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_full_reductions_mean_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using scalar_tensor = Tensor; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + + const IndexType offset = 64; + TensorMap in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.mean(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} template -static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device) { +static void test_full_reductions_mean_with_odd_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + // This is a particular case which illustrates a possible problem when the + // number of local threads in a workgroup is even, but is not a power of two. + using data_tensor = Tensor; + using scalar_tensor = Tensor; + // 2177 = (17 * 128) + 1 gives rise to 18 local threads. + // 8708 = 4 * 2177 = 4 * (17 * 128) + 4 uses 18 vectorised local threads. 
+ const IndexType n_elems = 8707; + array tensor_range = {{n_elems}}; + + data_tensor in(tensor_range); + DataType full_redux; + DataType full_redux_gpu; + TensorMap red_cpu(&full_redux); + TensorMap red_gpu(&full_redux_gpu); + + const DataType const_val = static_cast(0.6391); + in = in.constant(const_val); + + Eigen::IndexList> red_axis; + red_cpu = in.reduce(red_axis, Eigen::internal::MeanReducer()); + VERIFY_IS_APPROX(const_val, red_cpu()); + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data, tensor_range); + TensorMap out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = + in_gpu.reduce(red_axis, Eigen::internal::MeanReducer()); + sycl_device.memcpyDeviceToHost(red_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu, full_redux); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} +template +static void test_full_reductions_min_sycl( + const Eigen::SyclDevice& sycl_device) { const IndexType num_rows = 876; const IndexType num_cols = 953; array tensorRange = {{num_rows, num_cols}}; @@ -67,25 +373,73 @@ static void test_full_reductions_min_sycl(const Eigen::SyclDevice& sycl_device) full_redux = in.minimum(); - DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data =(DataType*)sycl_device.allocate(sizeof(DataType)); + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = (DataType*)sycl_device.allocate(sizeof(DataType)); - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data); + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu(gpu_out_data); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.minimum(); - sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, sizeof(DataType)); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } - template -static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice& sycl_device) { +static void test_full_reductions_min_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using scalar_tensor = Tensor; + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + + data_tensor in(tensor_range); + scalar_tensor full_redux; + scalar_tensor full_redux_gpu; + + in.setRandom(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set the initial value to be the min. + // As we don't include this in the reduction the result should not be -2. 
+ in(0) = static_cast(-2); + + const IndexType offset = 64; + TensorMap in_offset(in.data() + offset, tensor_offset_range); + full_redux = in_offset.minimum(); + VERIFY_IS_NOT_EQUAL(full_redux(), in(0)); + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.minimum(); + sycl_device.memcpyDeviceToHost(full_redux_gpu.data(), gpu_out_data, + sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + VERIFY_IS_APPROX(full_redux_gpu(), full_redux()); + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} +template +static void test_first_dim_reductions_max_sycl( + const Eigen::SyclDevice& sycl_device) { IndexType dim_x = 145; IndexType dim_y = 1; IndexType dim_z = 67; @@ -101,33 +455,293 @@ static void test_first_dim_reductions_max_sycl(const Eigen::SyclDevice& sycl_dev in.setRandom(); - redux= in.maximum(red_axis); + redux = in.maximum(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_first_dim_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using reduced_tensor = Tensor; + + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + array reduced_range = {{num_cols}}; + const IndexType n_elems = internal::array_prod(tensor_range); + const IndexType n_reduced = num_cols; + + data_tensor in(tensor_range); + reduced_tensor redux; + reduced_tensor redux_gpu(reduced_range); + + in.setRandom(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set maximum value outside of the considered range. 
+ for (IndexType i = 0; i < n_reduced; i++) { + in(i) = static_cast(2); + } + + Eigen::array red_axis; + red_axis[0] = 0; + + const IndexType offset = 64; + TensorMap in_offset(in.data() + offset, tensor_offset_range); + redux = in_offset.maximum(red_axis); + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_NOT_EQUAL(redux(i), in(i)); + } + + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = static_cast( + sycl_device.allocate(n_reduced * sizeof(DataType))); + + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data, reduced_range); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); + sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, + n_reduced * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_APPROX(redux_gpu(i), redux(i)); + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_last_dim_reductions_max_with_offset_sycl( + const Eigen::SyclDevice& sycl_device) { + using data_tensor = Tensor; + using reduced_tensor = Tensor; + + const IndexType num_rows = 64; + const IndexType num_cols = 64; + array tensor_range = {{num_rows, num_cols}}; + array full_reduced_range = {{num_rows}}; + array reduced_range = {{num_rows - 1}}; + const IndexType n_elems = internal::array_prod(tensor_range); + const IndexType n_reduced = reduced_range[0]; + + data_tensor in(tensor_range); + reduced_tensor redux(full_reduced_range); + reduced_tensor redux_gpu(reduced_range); + + in.setRandom(); + redux.setZero(); + array tensor_offset_range(tensor_range); + tensor_offset_range[0] -= 1; + // Set maximum value outside of the considered range. + for (IndexType i = 0; i < n_reduced; i++) { + in(i) = static_cast(2); + } - DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data = static_cast(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); + Eigen::array red_axis; + red_axis[0] = 1; + + const IndexType offset = 64; + // Introduce an offset in both the input and the output. + TensorMap in_offset(in.data() + offset, tensor_offset_range); + TensorMap red_offset(redux.data() + 1, reduced_range); + red_offset = in_offset.maximum(red_axis); + + // Check that the first value hasn't been changed and that the reduced values + // are not equal to the previously set maximum in the input outside the range. 
+ VERIFY_IS_EQUAL(redux(0), static_cast(0)); + for (IndexType i = 0; i < n_reduced; i++) { + VERIFY_IS_NOT_EQUAL(red_offset(i), in(i)); + } - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data, reduced_tensorRange); + DataType* gpu_in_data = + static_cast(sycl_device.allocate(n_elems * sizeof(DataType))); + DataType* gpu_out_data = static_cast( + sycl_device.allocate((n_reduced + 1) * sizeof(DataType))); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); + TensorMap in_gpu(gpu_in_data + offset, tensor_offset_range); + TensorMap out_gpu(gpu_out_data + 1, reduced_range); + sycl_device.memcpyHostToDevice(gpu_in_data, in.data(), + n_elems * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.maximum(red_axis); - sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost(redux_gpu.data(), out_gpu.data(), + n_reduced * sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. - for(IndexType j=0; j +static void test_first_dim_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device, IndexType dim_x, IndexType dim_y) { + array tensorRange = {{dim_x, dim_y}}; + Eigen::array red_axis; + red_axis[0] = 0; + array reduced_tensorRange = {{dim_y}}; + + Tensor in(tensorRange); + Tensor redux(reduced_tensorRange); + Tensor redux_gpu(reduced_tensorRange); + + in.setRandom(); + redux = in.sum(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.sum(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType i = 0; i < redux.size(); i++) { + VERIFY_IS_APPROX(redux_gpu.data()[i], redux.data()[i]); + } sycl_device.deallocate(gpu_in_data); sycl_device.deallocate(gpu_out_data); } template -static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_device) { +static void test_first_dim_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 145; + IndexType dim_y = 1; + IndexType dim_z = 67; + + array tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array red_axis; + red_axis[0] = 0; + array reduced_tensorRange = {{dim_y, dim_z}}; + + Tensor in(tensorRange); + Tensor redux(reduced_tensorRange); + Tensor redux_gpu(reduced_tensorRange); + + in.setRandom(); + + redux = in.mean(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + + // Check that the CPU and GPU reductions return the same result. + for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_last_dim_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 64; + IndexType dim_y = 1; + IndexType dim_z = 32; + + array tensorRange = {{dim_x, dim_y, dim_z}}; + Eigen::array red_axis; + red_axis[0] = 2; + array reduced_tensorRange = {{dim_x, dim_y}}; + + Tensor in(tensorRange); + Tensor redux(reduced_tensorRange); + Tensor redux_gpu(reduced_tensorRange); + + in.setRandom(); + + redux = in.mean(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu( + gpu_out_data, reduced_tensorRange); - IndexType dim_x = 567; + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu.device(sycl_device) = in_gpu.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) + for (IndexType k = 0; k < reduced_tensorRange[1]; k++) + VERIFY_IS_APPROX(redux_gpu(j, k), redux(j, k)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +static void test_last_dim_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + IndexType dim_x = 64; IndexType dim_y = 1; - IndexType dim_z = 47; + IndexType dim_z = 32; array tensorRange = {{dim_x, dim_y, dim_z}}; Eigen::array red_axis; @@ -140,42 +754,261 @@ static void test_last_dim_reductions_sum_sycl(const Eigen::SyclDevice &sycl_devi in.setRandom(); - redux= in.sum(red_axis); + redux = in.sum(red_axis); - DataType* gpu_in_data = static_cast(sycl_device.allocate(in.dimensions().TotalSize()*sizeof(DataType))); - DataType* gpu_out_data = static_cast(sycl_device.allocate(redux_gpu.dimensions().TotalSize()*sizeof(DataType))); + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu.dimensions().TotalSize() * sizeof(DataType))); - TensorMap > in_gpu(gpu_in_data, tensorRange); - TensorMap > out_gpu(gpu_out_data, reduced_tensorRange); + TensorMap> in_gpu(gpu_in_data, + tensorRange); + TensorMap> out_gpu( + gpu_out_data, reduced_tensorRange); - sycl_device.memcpyHostToDevice(gpu_in_data, in.data(),(in.dimensions().TotalSize())*sizeof(DataType)); + sycl_device.memcpyHostToDevice( + gpu_in_data, in.data(), (in.dimensions().TotalSize()) * sizeof(DataType)); out_gpu.device(sycl_device) = in_gpu.sum(red_axis); - sycl_device.memcpyDeviceToHost(redux_gpu.data(), gpu_out_data, redux_gpu.dimensions().TotalSize()*sizeof(DataType)); + sycl_device.memcpyDeviceToHost( + redux_gpu.data(), gpu_out_data, + redux_gpu.dimensions().TotalSize() * sizeof(DataType)); // Check that the CPU and GPU reductions return the same result. - for(IndexType j=0; j +static void test_last_reductions_sum_sycl( + const Eigen::SyclDevice& sycl_device) { + auto tensorRange = Sizes<64, 32>(64, 32); + // auto red_axis = Sizes<0,1>(0,1); + Eigen::IndexList> red_axis; + auto reduced_tensorRange = Sizes<64>(64); + TensorFixedSize, DataLayout> in_fix; + TensorFixedSize, DataLayout> redux_fix; + TensorFixedSize, DataLayout> redux_gpu_fix; + + in_fix.setRandom(); + + redux_fix = in_fix.sum(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap, DataLayout>> in_gpu_fix( + gpu_in_data, tensorRange); + TensorMap, DataLayout>> out_gpu_fix( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in_fix.data(), + (in_fix.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu_fix.device(sycl_device) = in_gpu_fix.sum(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu_fix.data(), gpu_out_data, + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)); + // Check that the CPU and GPU reductions return the same result. 
+ for (IndexType j = 0; j < reduced_tensorRange[0]; j++) { + VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j)); + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); } -template void sycl_reduction_test_per_device(const cl::sycl::device& d){ - std::cout << "Running on " << d.template get_info() << std::endl; - QueueInterface queueInterface(d); - auto sycl_device = Eigen::SyclDevice(&queueInterface); - test_full_reductions_mean_sycl(sycl_device); +template +static void test_last_reductions_mean_sycl( + const Eigen::SyclDevice& sycl_device) { + auto tensorRange = Sizes<64, 32>(64, 32); + Eigen::IndexList> red_axis; + auto reduced_tensorRange = Sizes<64>(64); + TensorFixedSize, DataLayout> in_fix; + TensorFixedSize, DataLayout> redux_fix; + TensorFixedSize, DataLayout> redux_gpu_fix; + + in_fix.setRandom(); + redux_fix = in_fix.mean(red_axis); + + DataType* gpu_in_data = static_cast( + sycl_device.allocate(in_fix.dimensions().TotalSize() * sizeof(DataType))); + DataType* gpu_out_data = static_cast(sycl_device.allocate( + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType))); + + TensorMap, DataLayout>> in_gpu_fix( + gpu_in_data, tensorRange); + TensorMap, DataLayout>> out_gpu_fix( + gpu_out_data, reduced_tensorRange); + + sycl_device.memcpyHostToDevice( + gpu_in_data, in_fix.data(), + (in_fix.dimensions().TotalSize()) * sizeof(DataType)); + out_gpu_fix.device(sycl_device) = in_gpu_fix.mean(red_axis); + sycl_device.memcpyDeviceToHost( + redux_gpu_fix.data(), gpu_out_data, + redux_gpu_fix.dimensions().TotalSize() * sizeof(DataType)); + sycl_device.synchronize(); + // Check that the CPU and GPU reductions return the same result. + for (IndexType j = 0; j < reduced_tensorRange[0]; j++) { + VERIFY_IS_APPROX(redux_gpu_fix(j), redux_fix(j)); + } + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +// SYCL supports a generic case of reduction where the accumulator is a +// different type than the input data This is an example on how to get if a +// Tensor contains nan and/or inf in one reduction +template +struct CustomReducer { + static const bool PacketAccess = false; + static const bool IsStateful = false; + + static constexpr OutT InfBit = 1; + static constexpr OutT NanBit = 2; + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const InT x, + OutT* accum) const { + if (Eigen::numext::isinf(x)) + *accum |= InfBit; + else if (Eigen::numext::isnan(x)) + *accum |= NanBit; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE void reduce(const OutT x, + OutT* accum) const { + *accum |= x; + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT initialize() const { + return OutT(0); + } + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE OutT finalize(const OutT accum) const { + return accum; + } +}; + +template +static void test_full_reductions_custom_sycl( + const Eigen::SyclDevice& sycl_device) { + constexpr IndexType InSize = 64; + auto tensorRange = Sizes(InSize); + Eigen::IndexList> dims; + auto reduced_tensorRange = Sizes<>(); + TensorFixedSize, DataLayout> in_fix; + TensorFixedSize, DataLayout> redux_gpu_fix; + + CustomReducer reducer; + + in_fix.setRandom(); + + size_t in_size_bytes = in_fix.dimensions().TotalSize() * sizeof(DataType); + DataType* gpu_in_data = + static_cast(sycl_device.allocate(in_size_bytes)); + AccumType* gpu_out_data = + static_cast(sycl_device.allocate(sizeof(AccumType))); + + TensorMap, DataLayout>> in_gpu_fix( + gpu_in_data, tensorRange); + TensorMap, DataLayout>> out_gpu_fix( + gpu_out_data, reduced_tensorRange); + + 
sycl_device.memcpyHostToDevice(gpu_in_data, in_fix.data(), in_size_bytes); + out_gpu_fix.device(sycl_device) = in_gpu_fix.reduce(dims, reducer); + sycl_device.memcpyDeviceToHost(redux_gpu_fix.data(), gpu_out_data, + sizeof(AccumType)); + VERIFY_IS_EQUAL(redux_gpu_fix(0), AccumType(0)); + + sycl_device.deallocate(gpu_in_data); + sycl_device.deallocate(gpu_out_data); +} + +template +void sycl_reduction_test_full_per_device(const Dev& sycl_device) { + test_full_reductions_sum_sycl(sycl_device); + test_full_reductions_sum_sycl(sycl_device); + test_full_reductions_min_sycl(sycl_device); test_full_reductions_min_sycl(sycl_device); + test_full_reductions_max_sycl(sycl_device); + test_full_reductions_max_sycl(sycl_device); + + test_full_reductions_mean_sycl(sycl_device); + test_full_reductions_mean_sycl(sycl_device); + test_full_reductions_custom_sycl( + sycl_device); + test_full_reductions_custom_sycl( + sycl_device); + sycl_device.synchronize(); +} + +template +void sycl_reduction_full_offset_per_device(const Dev& sycl_device) { + test_full_reductions_sum_with_offset_sycl( + sycl_device); + test_full_reductions_sum_with_offset_sycl( + sycl_device); + test_full_reductions_min_with_offset_sycl( + sycl_device); + test_full_reductions_min_with_offset_sycl( + sycl_device); + test_full_reductions_max_with_offset_sycl( + sycl_device); + test_full_reductions_max_with_offset_sycl( + sycl_device); + test_full_reductions_mean_with_offset_sycl( + sycl_device); + test_full_reductions_mean_with_offset_sycl( + sycl_device); + test_full_reductions_mean_with_odd_offset_sycl( + sycl_device); + sycl_device.synchronize(); +} + +template +void sycl_reduction_test_first_dim_per_device(const Dev& sycl_device) { + test_first_dim_reductions_sum_sycl(sycl_device, + 4197, 4097); + test_first_dim_reductions_sum_sycl(sycl_device, + 4197, 4097); + test_first_dim_reductions_sum_sycl(sycl_device, + 129, 8); test_first_dim_reductions_max_sycl(sycl_device); + test_first_dim_reductions_max_with_offset_sycl( + sycl_device); + sycl_device.synchronize(); +} + +template +void sycl_reduction_test_last_dim_per_device(const Dev& sycl_device) { test_last_dim_reductions_sum_sycl(sycl_device); - test_full_reductions_mean_sycl(sycl_device); - test_full_reductions_min_sycl(sycl_device); - test_first_dim_reductions_max_sycl(sycl_device); - test_last_dim_reductions_sum_sycl(sycl_device); + test_last_dim_reductions_max_with_offset_sycl( + sycl_device); + test_last_reductions_sum_sycl(sycl_device); + test_last_reductions_sum_sycl(sycl_device); + test_last_reductions_mean_sycl(sycl_device); + test_last_reductions_mean_sycl(sycl_device); + sycl_device.synchronize(); } + EIGEN_DECLARE_TEST(cxx11_tensor_reduction_sycl) { - for (const auto& device :Eigen::get_sycl_supported_devices()) { - CALL_SUBTEST(sycl_reduction_test_per_device(device)); + for (const auto& device : Eigen::get_sycl_supported_devices()) { + std::cout << "Running on " + << device.template get_info() + << std::endl; + QueueInterface queueInterface(device); + auto sycl_device = Eigen::SyclDevice(&queueInterface); + CALL_SUBTEST_1(sycl_reduction_test_full_per_device(sycl_device)); + CALL_SUBTEST_2(sycl_reduction_full_offset_per_device(sycl_device)); + CALL_SUBTEST_3( + sycl_reduction_test_first_dim_per_device(sycl_device)); + CALL_SUBTEST_4(sycl_reduction_test_last_dim_per_device(sycl_device)); } } -- cgit v1.2.3